From dfd6dc310ad076c35a5d4a7360fb011267ad7ccc Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Tue, 3 Dec 2024 14:04:02 +0100 Subject: [PATCH 1/2] exporters: added dcat serializer --- invenio.cfg | 9 ++++ site/zenodo_rdm/serializers/__init__.py | 2 + site/zenodo_rdm/serializers/dcat.py | 69 +++++++++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 site/zenodo_rdm/serializers/dcat.py diff --git a/invenio.cfg b/invenio.cfg index 7604e975..3ce8490a 100644 --- a/invenio.cfg +++ b/invenio.cfg @@ -1118,6 +1118,15 @@ APP_RDM_RECORD_EXPORTERS = { "params": {}, "content-type": "application/vnd.datacite.datacite+xml", "filename": "{id}.xml", + }, + "dcat-ap": { + "name": _("DCAT"), + "serializer": ( + "zenodo_rdm.serializers:ZenodoDCATSerializer" + ), + "params": {}, + "content-type": "application/dcat+xml", + "filename": "{id}.xml", }, "cff": { "name": _("Citation File Format"), diff --git a/site/zenodo_rdm/serializers/__init__.py b/site/zenodo_rdm/serializers/__init__.py index 9e631dab..84d6f0ca 100644 --- a/site/zenodo_rdm/serializers/__init__.py +++ b/site/zenodo_rdm/serializers/__init__.py @@ -10,6 +10,7 @@ from .cff import ZenodoCFFSerializer from .codemeta import ZenodoCodemetaSerializer from .datacite import ZenodoDataciteJSONSerializer, ZenodoDataciteXMLSerializer +from .dcat import ZenodoDCATSerializer __all__ = ( "ZenodoBibtexSerializer", @@ -17,4 +18,5 @@ "ZenodoDataciteJSONSerializer", "ZenodoDataciteXMLSerializer", "ZenodoCFFSerializer", + "ZenodoDCATSerializer", ) diff --git a/site/zenodo_rdm/serializers/dcat.py b/site/zenodo_rdm/serializers/dcat.py new file mode 100644 index 00000000..0a537dc3 --- /dev/null +++ b/site/zenodo_rdm/serializers/dcat.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# ZenodoRDM is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. 
"""Zenodo dcat serializer."""

import idutils
from datacite import schema43
from invenio_rdm_records.resources.serializers.dcat import DCATSerializer
from lxml import etree


class ZenodoDCATSerializer(DCATSerializer):
    """DCAT serializer that post-processes the XSLT output for Zenodo.

    After the DataCite-to-DCAT XSLT transformation, the resulting RDF tree is
    enriched with:

    * the ``valueURI`` (and optional definition) of each subject,
    * an ``rdf:about`` link for creators that only carry an identifier,
    * the record files (delegated to ``_add_files`` of the base class).
    """

    # Clark-notation names of the attributes/elements created by this class.
    _RDF_ABOUT = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    _SKOS_DEFINITION = "{http://www.w3.org/2004/02/skos/core#}definition"

    # Label and scheme are bound as XPath variables ($label, $scheme) rather
    # than interpolated into the expression, so values containing quotes
    # cannot break (or inject into) the query.
    _SUBJECT_XPATH = (
        "//dct:subject["
        "skos:Concept["
        "skos:prefLabel[text()=$label]"
        " and skos:inScheme/skos:ConceptScheme/dct:title[text()=$scheme]"
        "]"
        "]"
    )

    def __init__(self, **options):
        """Constructor."""
        super().__init__(**options)

    @staticmethod
    def _prefixed_namespaces(rdf_tree):
        """Return the prefix -> URI map of ``rdf_tree`` usable in XPath.

        lxml rejects a ``None`` (default namespace) key in the ``namespaces``
        argument of ``xpath()``, so only prefixed declarations are kept.
        """
        return {prefix: uri for prefix, uri in rdf_tree.nsmap.items() if prefix}

    def add_subjects_uri(self, rdf_tree, subjects):
        """Add the valueURI of subjects to the matching dct:subject elements.

        :param rdf_tree: root element of the DCAT RDF tree; modified in place.
        :param subjects: iterable of subject dicts; the keys ``valueURI``,
            ``subject``, ``subjectScheme`` and ``subjectProps`` are read.
        :returns: the same ``rdf_tree`` object.

        Subjects lacking a URI, label or scheme, and subjects for which no
        element is found in the tree, are skipped.
        """
        namespaces = self._prefixed_namespaces(rdf_tree)

        for subject in subjects:
            value_uri = subject.get("valueURI")
            subject_label = subject.get("subject")
            subject_scheme = subject.get("subjectScheme")
            subject_props = subject.get("subjectProps") or {}

            if not (value_uri and subject_label and subject_scheme):
                continue

            # Find the corresponding dct:subject element by prefLabel and
            # subjectScheme.
            matches = rdf_tree.xpath(
                self._SUBJECT_XPATH,
                namespaces=namespaces,
                label=subject_label,
                scheme=subject_scheme,
            )
            if not matches:
                # Nothing in the tree corresponds to this subject; do not
                # fail the whole export because of it.
                continue
            subject_element = matches[0]

            # Add the valueURI to the dct:subject element as rdf:about.
            # NOTE(review): in RDF/XML rdf:about is normally carried by node
            # elements (here the nested skos:Concept) rather than by property
            # elements such as dct:subject -- confirm the intended target.
            subject_element.set(self._RDF_ABOUT, value_uri)

            # Copy the definition from the subject props, if there is one.
            definition = subject_props.get("definition")
            if not definition:
                continue
            concept_elem = subject_element.find(
                ".//skos:Concept", namespaces=namespaces
            )
            if concept_elem is not None:
                skos_definition = etree.Element(self._SKOS_DEFINITION)
                skos_definition.text = definition
                concept_elem.append(skos_definition)

        return rdf_tree

    def add_missing_creator_link(self, rdf_tree):
        """Add `rdf:about` to `rdf:Description` within `dct:creator` if missing.

        The link is derived from the text of the ``dct:identifier`` child: the
        first identifier scheme detected by ``idutils`` that yields a
        non-empty URL is used. Descriptions without a usable identifier are
        left untouched.

        :param rdf_tree: root element of the DCAT RDF tree; modified in place.
        :returns: the same ``rdf_tree`` object.
        """
        namespaces = self._prefixed_namespaces(rdf_tree)
        creators = rdf_tree.xpath(
            "//dct:creator/rdf:Description[not(@rdf:about)]", namespaces=namespaces
        )

        for description in creators:
            identifier_elem = description.find("dct:identifier", namespaces)
            if identifier_elem is None:
                continue

            # ``text`` is None for an empty element.
            identifier = (identifier_elem.text or "").strip()
            if not identifier:
                continue

            # Resolve each candidate scheme once and keep the first URL found.
            candidate_urls = (
                idutils.to_url(identifier, scheme)
                for scheme in idutils.detect_identifier_schemes(identifier)
            )
            rdf_about_url = next((url for url in candidate_urls if url), None)
            if rdf_about_url:
                description.set(self._RDF_ABOUT, rdf_about_url)

        return rdf_tree

    def transform_with_xslt(self, dc_record, **kwargs):
        """Transform record with XSLT and enrich the resulting RDF tree.

        :param dc_record: DataCite record dict; besides being dumped with
            ``schema43``, its ``subjects`` and ``_files`` keys are read.
        :param kwargs: accepted for signature compatibility; unused here.
        :returns: root element of the resulting DCAT tree.
        """
        # Transform with base class functionality
        dc_etree = schema43.dump_etree(dc_record)
        dc_namespace = schema43.ns[None]
        dc_etree.tag = "{{{0}}}resource".format(dc_namespace)
        dcat_etree = self.xslt_transform_func(dc_etree).getroot()

        # Add valueURI to subjects
        subjects = dc_record.get("subjects", [])
        if subjects:
            dcat_etree = self.add_subjects_uri(dcat_etree, subjects)

        # Add the identifier links for creators if missing
        dcat_etree = self.add_missing_creator_link(dcat_etree)

        # Inject files in results (since the XSLT can't do that by default)
        files_data = dc_record.get("_files", [])
        if files_data:
            self._add_files(
                root=dcat_etree,
                files=files_data,
            )

        return dcat_etree