Skip to content

Commit

Permalink
Merge pull request #318 from ckan/multilingual
Browse files Browse the repository at this point in the history
Multilingual support in DCAT profiles
  • Loading branch information
amercader authored Oct 31, 2024
2 parents 1e945b6 + 2ca23ee commit ac1c34b
Show file tree
Hide file tree
Showing 17 changed files with 1,247 additions and 102 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ jobs:
pip install -r ckanext-harvest/requirements.txt
git clone https://github.com/ckan/ckanext-scheming
pip install -e ckanext-scheming
git clone https://github.com/ckan/ckanext-fluent
pip install -e ckanext-fluent
- name: Setup extension
run: |
ckan -c test.ini db init
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ To run the tests do:

pytest --ckan-ini=test.ini ckanext/dcat/tests

Note that there are tests relying on having [ckanext-harvest](https://github.com/ckan/ckanext-harvest), [ckanext-scheming](https://github.com/ckan/ckanext-scheming) and [ckanext-fluent](https://github.com/ckan/ckanext-fluent) installed.

## Releases

To create a new release, follow these steps:
Expand Down
167 changes: 148 additions & 19 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class URIRefOrLiteral(object):
Like CleanedURIRef, this is a factory class.
"""

def __new__(cls, value):
def __new__(cls, value, lang=None):
try:
stripped_value = value.strip()
if isinstance(value, str) and (
Expand All @@ -83,10 +83,10 @@ def __new__(cls, value):
# URI is fine, return the object
return uri_obj
else:
return Literal(value)
return Literal(value, lang=lang)
except Exception:
# In case something goes wrong: use Literal
return Literal(value)
return Literal(value, lang=lang)


class CleanedURIRef(object):
Expand Down Expand Up @@ -123,6 +123,8 @@ class RDFProfile(object):

_dataset_schema = None

_form_languages = None

# Cache for mappings of licenses URL/title to ID built when needed in
# _license().
_licenceregister_cache = None
Expand All @@ -145,6 +147,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):

self.compatibility_mode = compatibility_mode

self._default_lang = config.get("ckan.locale_default", "en")


try:
schema_show = get_action("scheming_dataset_schema_show")
try:
Expand All @@ -157,6 +162,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):
except KeyError:
pass

if self._dataset_schema:
self._form_languages = self._dataset_schema.get("form_languages")

def _datasets(self):
"""
Generator that returns all DCAT datasets on the graph
Expand Down Expand Up @@ -201,21 +209,40 @@ def _object(self, subject, predicate):
return _object
return None

def _object_value(self, subject, predicate):
def _object_value(self, subject, predicate, multilingual=False):
"""
Given a subject and a predicate, returns the value of the object
Both subject and predicate must be rdflib URIRef or BNode objects
If found, the string representation is returned, else an empty string
If multilingual is True, a dict with the language codes as keys will be
returned for each language found. e.g.
{
"en": "Dataset title",
"es": "Título del conjunto de datos"
}
If one of the languages defined in `form_languages` in the schema is not
found in the graph, an empty string will be returned.
{
"en": "Dataset title",
"es": ""
}
"""
default_lang = config.get("ckan.locale_default", "en")
if multilingual:
return self._object_value_multilingual(subject, predicate)
fallback = ""
for o in self.g.objects(subject, predicate):
if isinstance(o, Literal):
if o.language and o.language == default_lang:
if o.language and o.language == self._default_lang:
return str(o)
# Use first object as fallback if no object with the default language is available
# Use first object as fallback if no object with the default
# language is available
elif fallback == "":
fallback = str(o)
elif len(list(self.g.objects(o, RDFS.label))):
Expand All @@ -224,6 +251,31 @@ def _object_value(self, subject, predicate):
return str(o)
return fallback

def _object_value_multilingual(self, subject, predicate):
    """
    Given a subject and a predicate, returns a dict with the language codes
    as keys and the string value of the matching object as values. e.g.

        {
            "en": "Dataset title",
            "es": "Título del conjunto de datos"
        }

    Literal objects are stored under their own language tag, or under the
    default language (`self._default_lang`) if they do not have one.
    For any other object, its RDFS labels (if any) are used instead,
    following the same rule. If there are no labels, the string
    representation of the object itself is stored under the default
    language.

    If several values resolve to the same language, the last one found
    wins.

    Any language listed in `self._form_languages` that was not found in the
    graph is added with an empty string as value.
    """
    out = {}
    for o in self.g.objects(subject, predicate):
        if isinstance(o, Literal):
            values = [o]
        else:
            # Query the labels only once, and fall back to the node itself
            # if it has none
            values = list(self.g.objects(o, RDFS.label)) or [o]

        for value in values:
            # Labels are not guaranteed to be Literals, and nodes that are
            # not Literals do not provide a `language` attribute, so don't
            # access it directly
            lang = getattr(value, "language", None) or self._default_lang
            out[lang] = str(value)

    # Make sure all languages defined in the schema are present
    for lang in self._form_languages or []:
        out.setdefault(lang, "")

    return out

def _object_value_multiple_predicate(self, subject, predicates):
"""
Given a subject and a list of predicates, returns the value of the object
Expand Down Expand Up @@ -301,10 +353,45 @@ def _object_value_list(self, subject, predicate):
Both subject and predicate must be rdflib URIRef or BNode objects
If no values found, returns an empty string
If no values found, returns an empty list
"""
return [str(o) for o in self.g.objects(subject, predicate)]

def _object_value_list_multilingual(self, subject, predicate):
    """
    Given a subject and a predicate, returns a dict with the language codes
    as keys and the list of object values as values. e.g.

        {
            "en": ["Oaks", "Pines"],
            "es": ["Robles", "Pinos"],
        }

    Objects without a language tag (including objects that are not
    Literals) are stored under the default language
    (`self._default_lang`).

    If one of the languages defined in `form_languages` in the schema is not
    found in the graph, an empty list will be returned for it.

        {
            "en": ["Oaks", "Pines"],
            "es": [],
        }

    Both subject and predicate must be rdflib URIRef or BNode objects

    If no values are found and the schema does not define any
    `form_languages`, returns an empty dict
    """
    out = {}
    for o in self.g.objects(subject, predicate):
        # Objects that are not Literals do not provide a `language`
        # attribute, so don't access it directly
        lang = getattr(o, "language", None) or self._default_lang
        out.setdefault(lang, []).append(str(o))

    # Make sure all languages defined in the schema are present
    for lang in self._form_languages or []:
        out.setdefault(lang, [])

    return out

def _get_vcard_property_value(
self, subject, predicate, predicate_string_property=None
):
Expand Down Expand Up @@ -786,18 +873,25 @@ def _add_statement_to_graph(self, data_dict, key, subject, predicate, _class=Non
"""
value = self._get_dict_value(data_dict, key)
if value:
_object = URIRefOrLiteral(value)
if isinstance(_object, Literal):
statement_ref = BNode()
self.g.add((subject, predicate, statement_ref))
if _class:
self.g.add((statement_ref, RDF.type, _class))
self.g.add((statement_ref, RDFS.label, _object))

if isinstance(value, dict):
_objects = []
for lang in value:
_objects.append(URIRefOrLiteral(value[lang], lang))
else:
self.g.add((subject, predicate, _object))
if _class:
self.g.add((_object, RDF.type, _class))
_objects = [URIRefOrLiteral(value)]
statement_ref = None
for _object in _objects:
if isinstance(_object, Literal):
if not statement_ref:
statement_ref = BNode()
self.g.add((subject, predicate, statement_ref))
if _class:
self.g.add((statement_ref, RDF.type, _class))
self.g.add((statement_ref, RDFS.label, _object))
else:
self.g.add((subject, predicate, _object))
if _class:
self.g.add((_object, RDF.type, _class))

def _schema_field(self, key):
"""
Expand All @@ -823,6 +917,32 @@ def _schema_resource_field(self, key):
if field["field_name"] == key:
return field

def _multilingual_dataset_fields(self):
    """
    Return a list of field names in the dataset schema that have multilingual
    values (i.e. that use one of the fluent presets, detected by having a
    validator whose name starts with "fluent")
    """
    return self._multilingual_fields(entity="dataset")

def _multilingual_resource_fields(self):
    """
    Return a list of field names in the resource schema that have multilingual
    values (i.e. that use one of the fluent presets, detected by having a
    validator whose name starts with "fluent")
    """
    return self._multilingual_fields(entity="resource")

def _multilingual_fields(self, entity="dataset"):
    """
    Return a list with the names of the fields defined for the given entity
    ("dataset" or "resource") in the schema that have at least one validator
    whose name starts with "fluent".

    Returns an empty list if there is no dataset schema available.
    """
    schema = self._dataset_schema
    if not schema:
        return []

    def _is_fluent(field):
        # `validators` is a space separated string of validator names
        validators = field.get("validators")
        if not validators:
            return False
        return any(name.startswith("fluent") for name in validators.split())

    return [
        field["field_name"]
        for field in schema[f"{entity}_fields"]
        if _is_fluent(field)
    ]

def _set_dataset_value(self, dataset_dict, key, value):
"""
Sets the value for a given key in a CKAN dataset dict
Expand Down Expand Up @@ -949,7 +1069,16 @@ def _add_triple_from_dict(
elif value and date_value:
self._add_date_triple(subject, predicate, value, _type)
elif value:
# If it is a dict, we assume it's a fluent multilingual field
if isinstance(value, dict):
# We assume that all translated field values are Literals
for lang, translated_value in value.items():
object = Literal(translated_value, datatype=_datatype, lang=lang)
self.g.add((subject, predicate, object))
return

# Normal text value

# ensure URIRef items are preprocessed (space removal/url encoding)
if _type == URIRef:
_type = CleanedURIRef
Expand Down
Loading

0 comments on commit ac1c34b

Please sign in to comment.