refactored commonmeta schema

front-matter · Feb 28, 2023 · aa92c1f · aa92c1f
1 parent 8241f43
commit aa92c1f
Show file tree

Hide file tree

Showing 49 changed files with 10,978 additions and 1,132 deletions.
diff --git a/commonmeta/constants.py b/commonmeta/constants.py
@@ -7,12 +7,11 @@ class Commonmeta(TypedDict):
 
     id: str
     type: str
-    doi: str
     url: str
     creators: List[dict]
     titles: List[dict]
-    publisher: str
-    date: Optional[dict]
+    publisher: dict
+    date: dict
     additional_type: Optional[str]
     subjects: Optional[List[dict]]
     contributors: Optional[List[dict]]
@@ -21,16 +20,12 @@ class Commonmeta(TypedDict):
     sizes: Optional[List[dict]]
     formats: Optional[List[dict]]
     version: Optional[str]
-    rights: Optional[List[dict]]
+    license: Optional[dict]
     descriptions: Optional[List[dict]]
     geo_locations: Optional[List[dict]]
     funding_references: Optional[List[dict]]
     references: Optional[List[dict]]
     container: Optional[dict]
-    date_created: Optional[str]
-    date_registered: Optional[str]
-    date_published: Optional[str]
-    date_updated: Optional[str]
     content_url: Optional[List[dict]]
     agency: Optional[str]
     state: str
@@ -182,20 +177,24 @@ class Commonmeta(TypedDict):
 CM_TO_CR_TRANSLATIONS = {
     "Article": "PostedContent",
     "BookChapter": "BookChapter",
-    "BookPart": "BookPart",
-    "BookSection": "BookSection",
     "BookSeries": "BookSeries",
-    "BookSet": "BookSet",
-    "BookTrack": "BookTrack",
     "Book": "Book",
     "Component": "Component",
-    "Database": "Database",
     "Dataset": "Dataset",
     "Dissertation": "Dissertation",
-    "EditedBook": "EditedBook",
+    "Grant": "Grant",
     "JournalArticle": "JournalArticle",
-    "Other": "Other",
+    "JournalIssue": "JournalIssue",
+    "JournalVolume": "JournalVolume",
+    "Journal": "Journal",
+    "ProceedingsArticle": "ProceedingsArticle",
+    "ProceedingsSeries": "ProceedingsSeries",
+    "Proceedings": "Proceedings",
+    "ReportComponent": "ReportComponent",
+    "ReportSeries": "ReportSeries",
+    "Report": "Report",
     "Review": "PeerReview",
+    "Other": "Other",
 }
 
 # source: https://github.com/datacite/schema/blob/master/source/meta/kernel-4/include/datacite-resourceType-v4.xsd

diff --git a/commonmeta/doi_utils.py b/commonmeta/doi_utils.py
@@ -79,6 +79,16 @@ def get_doi_ra(doi) -> Optional[str]:
     return response.json()[0].get("RA", None)
 
 
+def get_crossref_member(member_id) -> Optional[dict]:
+    """Return {"id": member API URL, "name": primary name} for a Crossref member_id, or None on a non-200 response"""
+    url = "https://api.crossref.org/members/" + str(member_id)  # str(): tolerate a non-string member_id
+    response = requests.get(url, timeout=5)
+    if response.status_code != 200:
+        return None
+    data = response.json().get("message", None) or {}  # a missing/null "message" must not raise
+    return {"id": url, "name": data.get("primary-name", None)}
+
+
 def crossref_api_url(doi: str) -> str:
     """Return the Crossref API URL for a given DOI"""
     return "https://api.crossref.org/works/" + doi

diff --git a/commonmeta/metadata/metadata.py b/commonmeta/metadata/metadata.py
@@ -118,7 +118,7 @@ def __init__(self, string: Optional[str], **kwargs):
         self.sizes = meta.get("sizes")
         self.formats = meta.get("formats")
         self.version = meta.get("version")
-        self.rights = meta.get("rights")
+        self.license = meta.get("license")
         self.descriptions = meta.get("descriptions")
         self.geo_locations = meta.get("geo_locations")
         self.funding_references = meta.get("funding_references")
@@ -130,7 +130,7 @@ def __init__(self, string: Optional[str], **kwargs):
         self.date_updated = meta.get("date_updated")
         self.content_url = meta.get("content_url")
         self.container = meta.get("container")
-        self.agency = meta.get("agency")
+        self.provider = meta.get("provider")
         self.state = meta.get("state")
         self.schema_version = meta.get("schema_version")
         # citation style language options

diff --git a/commonmeta/readers/cff_reader.py b/commonmeta/readers/cff_reader.py
@@ -70,7 +70,7 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:
         "published": get_iso8601_date(meta.get("date-released")) if meta.get("date-released", None) else None
     }
 
-    publisher = "GitHub" if url and url.startswith("https://github.com") else None
+    publisher = {"name": "GitHub"} if url and url.startswith("https://github.com") else None
 
     if meta.get("abstract", None):
         descriptions = [
@@ -84,10 +84,9 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:
 
     subjects = [name_to_fos(i) for i in wrap(meta.get("keywords", None))]
 
-    if meta.get("licenseId", None):
-        rights = [dict_to_spdx({"rightsIdentifier": meta.get("licenseId")})]
-    else:
-        rights = None
+    license_ = meta.get("licenseId", None)
+    if license_ is not None:
+        license_ = dict_to_spdx({"id": meta.get("licenseId")})
 
     references = cff_references(wrap(meta.get("references", None)))
 
@@ -97,17 +96,17 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:
         "id": id_,
         "type": type_,
         # 'identifiers' => identifiers,
-        "doi": doi_from_url(id_) if id_ else None,
         "url": url,
         "titles": titles,
         "creators": creators,
         "publisher": publisher,
         "references": presence(references),
         "date": date,
         "descriptions": presence(descriptions),
-        "rights": rights,
+        "license": license_,
         "version": meta.get("version", None),
         "subjects": presence(subjects),
+        "provider": "DataCite" if id_ else "GitHub",
         "state": state,
     } | read_options
 

diff --git a/commonmeta/readers/citeproc_reader.py b/commonmeta/readers/citeproc_reader.py
@@ -3,7 +3,7 @@
 from ..base_utils import wrap, compact, sanitize
 from ..author_utils import get_authors
 from ..date_utils import get_date_from_date_parts
-from ..doi_utils import doi_from_url
+from ..doi_utils import doi_from_url, get_doi_ra
 from ..constants import (
     CP_TO_CM_TRANSLATIONS,
     Commonmeta,
@@ -26,10 +26,9 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:
 
     date = {'published': get_date_from_date_parts(meta.get("issued", None))}
 
-    if meta.get("copyright", None):
-        rights = [dict_to_spdx({"rightsURI": meta.get("copyright")})]
-    else:
-        rights = None
+    license_ = meta.get("copyright", None)
+    if license_ is not None:
+        license_ = dict_to_spdx({"url": meta.get("copyright")})
 
     pages = meta.get("page", "").split("-")
     container = compact(
@@ -58,6 +57,8 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:
     else:
         descriptions = None
 
+    provider = get_doi_ra(id_)
+
     return {
         "id": id_,
         "type": type_,
@@ -71,8 +72,9 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:
         "container": container,
         "references": None,
         "descriptions": descriptions,
-        "rights": rights,
+        "license": license_,
         "version": meta.get("version", None),
         "subjects": subjects,
+        "provider": provider,
         "state": state,
     } | read_options
diff --git a/commonmeta/readers/codemeta_reader.py b/commonmeta/readers/codemeta_reader.py
@@ -66,7 +66,7 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
     date['published'] = meta.get("datePublished", None)
     date['updated'] = meta.get("dateModified", None)
 
-    publisher = meta.get("publisher", None)
+    publisher = {"name": meta.get("publisher", None)}
 
     if meta.get("description", None):
         descriptions = [
@@ -86,17 +86,16 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
     else:
         titles = [{"title": has_title}]
 
-    if meta.get("licenseId", None):
-        rights = [dict_to_spdx({"rightsIdentifier": meta.get("licenseId")})]
-    else:
-        rights = None
-
+    license_ = meta.get("licenseId", None)
+    if license_:
+        license_ = dict_to_spdx({"id": meta.get("licenseId")})
+
+    provider = "DataCite" if doi_from_url(id_) else "GitHub"
     state = "findable" if meta or read_options else "not_found"
 
     return {
         "id": id_,
         "type": type_,
-        "doi": doi_from_url(id_) if id_ else None,
         "url": normalize_id(meta.get("codeRepository", None)),
         "identifiers": None,
         "titles": titles,
@@ -105,8 +104,9 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
         "publisher": publisher,
         "date": compact(date),
         "descriptions": descriptions,
-        "rights": rights,
+        "license": license_,
         "version": meta.get("version", None),
         "subjects": presence(subjects),
+        "provider": provider,
         "state": state,
     } | read_options
diff --git a/commonmeta/readers/crossref_reader.py b/commonmeta/readers/crossref_reader.py
@@ -13,7 +13,7 @@
 from ..base_utils import wrap, compact, presence, sanitize
 from ..author_utils import get_authors
 from ..date_utils import get_date_from_date_parts
-from ..doi_utils import doi_as_url, doi_from_url, get_doi_ra, crossref_api_url
+from ..doi_utils import doi_as_url, doi_from_url, get_doi_ra, get_crossref_member, crossref_api_url
 from ..constants import (
     CR_TO_CM_TRANSLATIONS,
     Commonmeta,
@@ -68,7 +68,13 @@ def editor_type(item):
         titles = [{"title": sanitize(title)}]
     else:
         titles = []
-    publisher = meta.get("publisher", None)
+
+    member_id = meta.get("member", None)
+    # TODO: looking up the publisher by member_id almost always returns the publisher name, but sometimes does not
+    if member_id is not None:
+        publisher = get_crossref_member(member_id)
+    else:
+        publisher = meta.get("publisher", None)
 
     date: dict = {}
     date['submitted'] = None
@@ -80,9 +86,7 @@ def editor_type(item):
     license_ = meta.get("license", None)
     if license_ is not None:
         license_ = normalize_cc_url(license_[0].get("URL", None))
-        rights = [dict_to_spdx({"rightsUri": license_})] if license_ else None
-    else:
-        rights = None
+        license_ = dict_to_spdx({"url": license_}) if license_ else None
 
     issns = meta.get("issn-type", None)
     if issns is not None:
@@ -166,15 +170,15 @@ def editor_type(item):
         "sizes": None,
         "formats": None,
         "version": meta.get("version", None),
-        "rights": rights,
+        "license": license_,
         "descriptions": descriptions,
         "geo_locations": None,
         "funding_references": presence(funding_references),
         "references": references,
         # other properties
         "content_url": presence(meta.get("contentUrl", None)),
         "container": container,
-        "agency": get_doi_ra(id_),
+        "provider": get_doi_ra(id_),
         "state": state,
         "schema_version": None,
     } | read_options