Skip to content

Commit

Permalink
refactored commonmeta schema
Browse files Browse the repository at this point in the history
  • Loading branch information
mfenner committed Feb 28, 2023
1 parent 8241f43 commit aa92c1f
Show file tree
Hide file tree
Showing 49 changed files with 10,978 additions and 1,132 deletions.
29 changes: 14 additions & 15 deletions commonmeta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@ class Commonmeta(TypedDict):

id: str
type: str
doi: str
url: str
creators: List[dict]
titles: List[dict]
publisher: str
date: Optional[dict]
publisher: dict
date: dict
additional_type: Optional[str]
subjects: Optional[List[dict]]
contributors: Optional[List[dict]]
Expand All @@ -21,16 +20,12 @@ class Commonmeta(TypedDict):
sizes: Optional[List[dict]]
formats: Optional[List[dict]]
version: Optional[str]
rights: Optional[List[dict]]
license: Optional[dict]
descriptions: Optional[List[dict]]
geo_locations: Optional[List[dict]]
funding_references: Optional[List[dict]]
references: Optional[List[dict]]
container: Optional[dict]
date_created: Optional[str]
date_registered: Optional[str]
date_published: Optional[str]
date_updated: Optional[str]
content_url: Optional[List[dict]]
agency: Optional[str]
state: str
Expand Down Expand Up @@ -182,20 +177,24 @@ class Commonmeta(TypedDict):
# Type-name translation table. Judging by the constant's name, keys are
# commonmeta (CM) types and values are the corresponding Crossref (CR)
# types -- TODO confirm against the callers.
# Most entries map a name to itself; the visible exceptions are
# "Article" -> "PostedContent" and "Review" -> "PeerReview".
# NOTE(review): "Other" is listed twice below. A repeated key is legal in a
# dict literal (the last occurrence wins), but here it looks like removed and
# added diff lines shown together -- confirm against the actual file.
CM_TO_CR_TRANSLATIONS = {
"Article": "PostedContent",
"BookChapter": "BookChapter",
"BookPart": "BookPart",
"BookSection": "BookSection",
"BookSeries": "BookSeries",
"BookSet": "BookSet",
"BookTrack": "BookTrack",
"Book": "Book",
"Component": "Component",
"Database": "Database",
"Dataset": "Dataset",
"Dissertation": "Dissertation",
"EditedBook": "EditedBook",
"Grant": "Grant",
"JournalArticle": "JournalArticle",
"Other": "Other",
"JournalIssue": "JournalIssue",
"JournalVolume": "JournalVolume",
"Journal": "Journal",
"ProceedingsArticle": "ProceedingsArticle",
"ProceedingsSeries": "ProceedingsSeries",
"Proceedings": "Proceedings",
"ReportComponent": "ReportComponent",
"ReportSeries": "ReportSeries",
"Report": "Report",
"Review": "PeerReview",
"Other": "Other",
}

# source: https://github.com/datacite/schema/blob/master/source/meta/kernel-4/include/datacite-resourceType-v4.xsd
Expand Down
10 changes: 10 additions & 0 deletions commonmeta/doi_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,16 @@ def get_doi_ra(doi) -> Optional[str]:
return response.json()[0].get("RA", None)


def get_crossref_member(member_id) -> Optional[dict]:
    """Return the Crossref member for a given member_id.

    Sends a GET request (5 second timeout) to the Crossref REST API members
    endpoint and builds a small dict from the response.

    Args:
        member_id: Crossref member identifier; it is interpolated into the
            request URL, so a string or an integer both work.

    Returns:
        A dict with "id" (the members API URL for this member) and "name"
        (the "primary-name" field of the response message, or None when
        that field is absent). Returns None when the response status is
        not 200 or when the JSON payload carries no "message" object.
    """
    # Build the URL once; an f-string avoids a TypeError for numeric ids.
    member_url = f"https://api.crossref.org/members/{member_id}"
    response = requests.get(member_url, timeout=5)
    if response.status_code != 200:
        return None
    data = response.json().get("message", None)
    # A 200 response without a usable "message" would otherwise crash on .get().
    if not isinstance(data, dict):
        return None
    return {"id": member_url, "name": data.get("primary-name", None)}


def crossref_api_url(doi: str) -> str:
"""Return the Crossref API URL for a given DOI"""
return "https://api.crossref.org/works/" + doi
Expand Down
4 changes: 2 additions & 2 deletions commonmeta/metadata/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def __init__(self, string: Optional[str], **kwargs):
self.sizes = meta.get("sizes")
self.formats = meta.get("formats")
self.version = meta.get("version")
self.rights = meta.get("rights")
self.license = meta.get("license")
self.descriptions = meta.get("descriptions")
self.geo_locations = meta.get("geo_locations")
self.funding_references = meta.get("funding_references")
Expand All @@ -130,7 +130,7 @@ def __init__(self, string: Optional[str], **kwargs):
self.date_updated = meta.get("date_updated")
self.content_url = meta.get("content_url")
self.container = meta.get("container")
self.agency = meta.get("agency")
self.provider = meta.get("provider")
self.state = meta.get("state")
self.schema_version = meta.get("schema_version")
# citation style language options
Expand Down
13 changes: 6 additions & 7 deletions commonmeta/readers/cff_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:
"published": get_iso8601_date(meta.get("date-released")) if meta.get("date-released", None) else None
}

publisher = "GitHub" if url and url.startswith("https://github.com") else None
publisher = {"name": "GitHub"} if url and url.startswith("https://github.com") else None

if meta.get("abstract", None):
descriptions = [
Expand All @@ -84,10 +84,9 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:

subjects = [name_to_fos(i) for i in wrap(meta.get("keywords", None))]

if meta.get("licenseId", None):
rights = [dict_to_spdx({"rightsIdentifier": meta.get("licenseId")})]
else:
rights = None
license_ = meta.get("licenseId", None)
if license_ is not None:
license_ = dict_to_spdx({"id": meta.get("licenseId")})

references = cff_references(wrap(meta.get("references", None)))

Expand All @@ -97,17 +96,17 @@ def read_cff(data: Optional[dict], **kwargs) -> Commonmeta:
"id": id_,
"type": type_,
# 'identifiers' => identifiers,
"doi": doi_from_url(id_) if id_ else None,
"url": url,
"titles": titles,
"creators": creators,
"publisher": publisher,
"references": presence(references),
"date": date,
"descriptions": presence(descriptions),
"rights": rights,
"license": license_,
"version": meta.get("version", None),
"subjects": presence(subjects),
"provider": "DataCite" if id_ else "GitHub",
"state": state,
} | read_options

Expand Down
14 changes: 8 additions & 6 deletions commonmeta/readers/citeproc_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ..base_utils import wrap, compact, sanitize
from ..author_utils import get_authors
from ..date_utils import get_date_from_date_parts
from ..doi_utils import doi_from_url
from ..doi_utils import doi_from_url, get_doi_ra
from ..constants import (
CP_TO_CM_TRANSLATIONS,
Commonmeta,
Expand All @@ -26,10 +26,9 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:

date = {'published': get_date_from_date_parts(meta.get("issued", None))}

if meta.get("copyright", None):
rights = [dict_to_spdx({"rightsURI": meta.get("copyright")})]
else:
rights = None
license_ = meta.get("copyright", None)
if license_ is not None:
license_ = dict_to_spdx({"url": meta.get("copyright")})

pages = meta.get("page", "").split("-")
container = compact(
Expand Down Expand Up @@ -58,6 +57,8 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:
else:
descriptions = None

provider = get_doi_ra(id_)

return {
"id": id_,
"type": type_,
Expand All @@ -71,8 +72,9 @@ def read_citeproc(data: dict, **kwargs) -> Commonmeta:
"container": container,
"references": None,
"descriptions": descriptions,
"rights": rights,
"license": license_,
"version": meta.get("version", None),
"subjects": subjects,
"provider": provider,
"state": state,
} | read_options
16 changes: 8 additions & 8 deletions commonmeta/readers/codemeta_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
date['published'] = meta.get("datePublished", None)
date['updated'] = meta.get("dateModified", None)

publisher = meta.get("publisher", None)
publisher = {"name": meta.get("publisher", None)}

if meta.get("description", None):
descriptions = [
Expand All @@ -86,17 +86,16 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
else:
titles = [{"title": has_title}]

if meta.get("licenseId", None):
rights = [dict_to_spdx({"rightsIdentifier": meta.get("licenseId")})]
else:
rights = None

license_ = meta.get("licenseId", None)
if license_:
license_ = dict_to_spdx({"id": meta.get("licenseId")})

provider = "DataCite" if doi_from_url(id_) else "GitHub"
state = "findable" if meta or read_options else "not_found"

return {
"id": id_,
"type": type_,
"doi": doi_from_url(id_) if id_ else None,
"url": normalize_id(meta.get("codeRepository", None)),
"identifiers": None,
"titles": titles,
Expand All @@ -105,8 +104,9 @@ def read_codemeta(data: Optional[dict], **kwargs) -> Commonmeta:
"publisher": publisher,
"date": compact(date),
"descriptions": descriptions,
"rights": rights,
"license": license_,
"version": meta.get("version", None),
"subjects": presence(subjects),
"provider": provider,
"state": state,
} | read_options
18 changes: 11 additions & 7 deletions commonmeta/readers/crossref_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ..base_utils import wrap, compact, presence, sanitize
from ..author_utils import get_authors
from ..date_utils import get_date_from_date_parts
from ..doi_utils import doi_as_url, doi_from_url, get_doi_ra, crossref_api_url
from ..doi_utils import doi_as_url, doi_from_url, get_doi_ra, get_crossref_member, crossref_api_url
from ..constants import (
CR_TO_CM_TRANSLATIONS,
Commonmeta,
Expand Down Expand Up @@ -68,7 +68,13 @@ def editor_type(item):
titles = [{"title": sanitize(title)}]
else:
titles = []
publisher = meta.get("publisher", None)

member_id = meta.get("member", None)
# TODO: looking up the publisher via member_id almost always returns the publisher name, but occasionally it does not
if member_id is not None:
publisher = get_crossref_member(member_id)
else:
publisher = meta.get("publisher", None)

date: dict = {}
date['submitted'] = None
Expand All @@ -80,9 +86,7 @@ def editor_type(item):
license_ = meta.get("license", None)
if license_ is not None:
license_ = normalize_cc_url(license_[0].get("URL", None))
rights = [dict_to_spdx({"rightsUri": license_})] if license_ else None
else:
rights = None
license_ = dict_to_spdx({"url": license_}) if license_ else None

issns = meta.get("issn-type", None)
if issns is not None:
Expand Down Expand Up @@ -166,15 +170,15 @@ def editor_type(item):
"sizes": None,
"formats": None,
"version": meta.get("version", None),
"rights": rights,
"license": license_,
"descriptions": descriptions,
"geo_locations": None,
"funding_references": presence(funding_references),
"references": references,
# other properties
"content_url": presence(meta.get("contentUrl", None)),
"container": container,
"agency": get_doi_ra(id_),
"provider": get_doi_ra(id_),
"state": state,
"schema_version": None,
} | read_options
Expand Down
Loading

0 comments on commit aa92c1f

Please sign in to comment.