Skip to content

Commit

Permalink
Handle other InvenioRDM instances besides Zenodo
Browse files: browse the repository at this point in the history
  • Loading branch information
mfenner committed Sep 28, 2024
1 parent 7a4cc01 commit eb0d5fe
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 80 deletions.
55 changes: 38 additions & 17 deletions commonmeta/readers/inveniordm_reader.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""InvenioRDM reader for Commonmeta"""
import httpx
from pydash import py_
from furl import furl

from ..utils import (
normalize_url,
normalize_doi,
dict_to_spdx,
name_to_fos,
from_inveniordm,
get_language,
)
from ..base_utils import compact, wrap, presence, sanitize
from ..author_utils import get_authors
Expand All @@ -33,36 +35,53 @@ def get_inveniordm(pid: str, **kwargs) -> dict:

def read_inveniordm(data: dict, **kwargs) -> Commonmeta:
"""read_inveniordm"""
print(data)
meta = data
read_options = kwargs or {}

_id = doi_as_url(meta.get("doi", None))
resource_type = py_.get(meta, "metadata.resource_type.type")
url = normalize_url(py_.get(meta, "links.self_html"))
_id = doi_as_url(meta.get("doi", None)) or url
resource_type = py_.get(meta, "metadata.resource_type.type") or py_.get(meta, "metadata.resource_type.id")
resource_type = resource_type.split("-")[0]
_type = INVENIORDM_TO_CM_TRANSLATIONS.get(resource_type, "Other")

contributors = py_.get(meta, "metadata.creators")
print(contributors)

contributors = get_authors(
from_inveniordm(wrap(py_.get(meta, "metadata.creators")))
from_inveniordm(wrap(contributors)),
)

publisher = {"name": meta.get("publisher", None) or "Zenodo"}
publisher = {"name": meta.get("publisher", None) or py_.get(meta, "metadata.publisher") or "Zenodo"}

title = py_.get(meta, "metadata.title")
print(title)
titles = [{"title": sanitize(title)}] if title else None
additional_titles = py_.get(meta, "metadata.additional_titles")
print(additional_titles)
# if additional_titles:
# titles += [{"title": sanitize("bla")} for i in wrap(additional_titles)]

date: dict = {}
date["published"] = py_.get(meta, ("metadata.publication_date"))
if date["published"]:
date["published"] = date["published"].split("/")[0]
date["updated"] = strip_milliseconds(meta.get("updated", None))
container = compact(
{
"id": "https://www.re3data.org/repository/r3d100010468",
"type": "DataRepository" if _type == "Dataset" else "Repository",
"title": "Zenodo",
}
)
f = furl(url)
if f.host == "zenodo.org":
container = compact(
{
"id": "https://www.re3data.org/repository/r3d100010468",
"type": "DataRepository" if _type == "Dataset" else "Repository",
"title": "Zenodo",
}
)
elif f.host in ["rogue-scholar.org", "beta.rogue-scholar.org", "demo.front-matter.io"]:
container = compact(
{
"type": "Repository",
"title": "Rogue Scholar",
}
)
else:
container = None
license_ = py_.get(meta, "metadata.license.id")
if license_:
license_ = dict_to_spdx({"id": license_})
Expand All @@ -73,7 +92,9 @@ def read_inveniordm(data: dict, **kwargs) -> Commonmeta:
py_.get(meta, "metadata.notes"),
]
)
language = py_.get(meta, "metadata.language")
language = py_.get(meta, "metadata.language") or py_.get(meta, "metadata.languages[0].id")
if language:
language = get_language(language).alpha_2
subjects = [name_to_fos(i) for i in wrap(py_.get(meta, "metadata.keywords"))]

references = get_references(wrap(py_.get(meta, "metadata.related_identifiers")))
Expand All @@ -92,7 +113,7 @@ def read_inveniordm(data: dict, **kwargs) -> Commonmeta:
"id": _id,
"type": _type,
"doi": doi_from_url(_id),
"url": normalize_url(py_.get(meta, "links.self_html")),
"url": url,
"contributors": presence(contributors),
"titles": titles,
"publisher": publisher,
Expand Down Expand Up @@ -169,7 +190,7 @@ def map_relation(relation: dict) -> dict:
"""map_relation"""
identifier = relation.get("identifier", None)
scheme = relation.get("scheme", None)
relation_type = relation.get("relation", None)
relation_type = relation.get("relation", None) or relation.get("relation_type", None)
if scheme == "doi":
identifier = doi_as_url(identifier)
else:
Expand Down
5 changes: 4 additions & 1 deletion commonmeta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,9 @@ def from_inveniordm(elements: list) -> list:
"""Convert from inveniordm elements"""

def format_element(element):
if "person_or_org" in element.keys():
element = element["person_or_org"]

"""format element"""
if not isinstance(element, dict):
return None
Expand Down Expand Up @@ -620,7 +623,7 @@ def find_from_format_by_id(pid: str) -> Optional[str]:
return "cff"
if re.match(r"\Ahttps:/(/)?api\.rogue-scholar\.org/posts/(.+)\Z", pid) is not None:
return "json_feed_item"
if re.match(r"\Ahttps:/(/)?zenodo\.org/api/records/(.+)\Z", pid) is not None:
if re.match(r"\Ahttps:/(/)(.+)/api/records/(.+)\Z", pid) is not None:
return "inveniordm"
return "schema_org"

Expand Down
10 changes: 5 additions & 5 deletions tests/cassettes/test-inveniordm_reader/test_rogue_scholar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -113,21 +113,21 @@ interactions:
content-type:
- application/json
date:
- Fri, 27 Sep 2024 16:12:37 GMT
- Sat, 28 Sep 2024 15:43:31 GMT
etag:
- '"3"'
fly-request-id:
- 01J8T42VRHDAB6VV8Y8YB0YR8Q-cdg
- 01J8WMTA9D9EXVVKJNH68KVNQN-cdg
link:
- <https://demo.front-matter.io/api/records/kqfsz-qzd05> ; rel="linkset" ; type="application/linkset+json"
permissions-policy:
- interest-cohort=()
referrer-policy:
- strict-origin-when-cross-origin
retry-after:
- '59'
- '60'
server:
- Fly/0c45e4378 (2024-09-20)
- Fly/a06ddcf9d (2024-09-27)
strict-transport-security:
- max-age=31556926; includeSubDomains
transfer-encoding:
Expand All @@ -143,7 +143,7 @@ interactions:
x-ratelimit-remaining:
- '499'
x-ratelimit-reset:
- '1727453617'
- '1727538272'
x-xss-protection:
- 1; mode=block
status:
Expand Down
6 changes: 1 addition & 5 deletions tests/test-crossref_xml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,11 +536,7 @@ def test_doi_with_sici():
"familyName": "Fenton",
}
assert subject.license == {"url": "https://doi.wiley.com/10.1002/tdm_license_1.1"}
assert subject.date == {
"created": "2007-06-04",
"published": "2006-11",
"updated": "2024-02-14",
}
assert subject.date["published"] == "2006-11"
assert subject.publisher == {
"id": "https://api.crossref.org/members/311",
"name": "Wiley",
Expand Down
69 changes: 17 additions & 52 deletions tests/test-inveniordm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def test_dataset():
"type": "DataRepository",
"title": "Zenodo",
}
assert subject.language == "eng"
assert subject.language == "en"
assert subject.version == "162"
assert len(subject.files) == 24
assert subject.files[0] == {
Expand All @@ -281,68 +281,33 @@ def test_rogue_scholar():
string = "https://beta.rogue-scholar.org/api/records/kqfsz-qzd05"
subject = Metadata(string)
assert subject.is_valid
assert subject.id == "https://beta.rogue-scholar.org/api/records/kqfsz-qzd05"
assert subject.type == "WebPage"
assert subject.url == "https://beta.rogue-scholar.org/api/records/kqfsz-qzd05"
# assert subject.titles[0] == {
# "title": "The Origins of SARS-CoV-2: A Critical Review"
# }
print(subject)
assert len(subject.contributors) == 21
assert subject.id == "https://demo.front-matter.io/records/kqfsz-qzd05"
assert subject.type == "Image"
assert subject.url == "https://demo.front-matter.io/records/kqfsz-qzd05"
assert subject.titles[0] == {
"title": "Elliott Group's gallery"
}
assert len(subject.contributors) == 4
assert subject.contributors[0] == {
"type": "Person",
"contributorRoles": ["Author"],
"givenName": "Edward C",
"familyName": "Holmes",
"affiliations": [
{
"name": "School of Life and Environmental Sciences and School of Medical Sciences, The University of Sydney, Sydney, NSW 2006, Australia"
}
],
"givenName": "Phillip",
"familyName": "Burton",
}
assert subject.license == {
"id": "CC-BY-NC-ND-4.0",
"url": "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode",
}

assert subject.date == {
"published": "2021-08-18",
"updated": "2022-07-01T11:30:53Z",
}
assert subject.relations == [
{"id": "https://doi.org/10.5281/zenodo.5075887", "type": "IsVersionOf"},
]
assert subject.license is None
assert subject.date["published"] == "1994-02"
assert subject.publisher == {
"name": "Zenodo",
"name": "InvenioRDM",
}
assert subject.funding_references is None
assert (
subject.descriptions[0]
.get("description")
.startswith("The Origins of SARS-CoV-2: A Critical Review Holmes et al.")
.startswith("One state discussion green sit if.")
)
assert (
subject.descriptions[1]
.get("description")
.startswith("Authors' final peer-reviewed version.")
)
assert subject.subjects == [
{"subject": "sars-cov-2"},
{"subject": "covid-19"},
{"subject": "origins"},
{"subject": "zoonosis"},
]
assert subject.container == {
"id": "https://www.re3data.org/repository/r3d100010468",
"type": "Repository",
"title": "Zenodo",
}
assert subject.language is None
assert subject.version == "Authors' final version"
assert len(subject.files) == 3
assert subject.files[0] == {
"key": "Holmes_et_al_(2021)_Cell_Supplementary.pdf",
"checksum": "md5:bdb88fc94708d8fd7d87854031faa8ab",
"url": "https://zenodo.org/api/records/5244404/files/Holmes_et_al_(2021)_Cell_Supplementary.pdf/content",
"size": 197003,
"title": "Rogue Scholar",
}
assert subject.language == "en"
assert subject.version == "v0.0.1"

0 comments on commit eb0d5fe

Please sign in to comment.