Skip to content

Commit

Permalink
fix language parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mfenner committed Sep 29, 2024
1 parent eb0d5fe commit 30ebd3d
Show file tree
Hide file tree
Showing 18 changed files with 366 additions and 356 deletions.
4 changes: 1 addition & 3 deletions commonmeta/readers/inveniordm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ def read_inveniordm(data: dict, **kwargs) -> Commonmeta:
]
)
language = py_.get(meta, "metadata.language") or py_.get(meta, "metadata.languages[0].id")
if language:
language = get_language(language).alpha_2
subjects = [name_to_fos(i) for i in wrap(py_.get(meta, "metadata.keywords"))]

references = get_references(wrap(py_.get(meta, "metadata.related_identifiers")))
Expand All @@ -121,7 +119,7 @@ def read_inveniordm(data: dict, **kwargs) -> Commonmeta:
# recommended and optional properties
# "additional_type": additional_type,
"subjects": presence(subjects),
"language": language,
"language": get_language(language),
"version": py_.get(meta, "metadata.version"),
"license": presence(license_),
"descriptions": descriptions,
Expand Down
3 changes: 2 additions & 1 deletion commonmeta/readers/schema_org_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
normalize_ids,
normalize_url,
name_to_fos,
get_language,
)
from ..readers.crossref_reader import get_crossref
from ..readers.datacite_reader import get_datacite
Expand Down Expand Up @@ -292,7 +293,7 @@ def read_schema_org(data: Optional[dict], **kwargs) -> Commonmeta:
# recommended and optional attributes
"additional_type": additional_type,
"subjects": presence(subjects),
"language": language,
"language": get_language(language),
"identifiers": identifiers,
"sizes": None,
"formats": None,
Expand Down
18 changes: 14 additions & 4 deletions commonmeta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,9 +1043,10 @@ def issn_as_url(issn: str) -> Optional[str]:
return f"https://portal.issn.org/resource/ISSN/{issn}"


def get_language(lang: str) -> Optional[dict]:
"""Provide a language object based on ISO 639, with either a name in English,
ISO 639-1, or ISO 639-3 code as input.
def get_language(lang: str, format: str="alpha_2") -> Optional[str]:
"""Provide a language string based on ISO 639, with either a name in English,
ISO 639-1, or ISO 639-3 code as input. Optionally format as alpha_2 (default),
alpha_3, or name.
"""
if not lang:
return None
Expand All @@ -1055,7 +1056,16 @@ def get_language(lang: str) -> Optional[dict]:
language = pycountry.languages.get(alpha_3=lang)
else:
language = pycountry.languages.get(name=lang)
return language

if language is None:
return None
elif format == "name":
return language.name
elif format == "alpha_3":
return language.alpha_3

else:
return language.alpha_2


def start_case(content: str) -> str:
Expand Down
4 changes: 2 additions & 2 deletions commonmeta/writers/bibtex_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bibtexparser.bibdatabase import BibDatabase
from bibtexparser.customization import page_double_hyphen

from ..utils import pages_as_string
from ..utils import pages_as_string, get_language
from ..base_utils import compact
from ..author_utils import authors_as_string
from ..date_utils import get_month_from_date, get_iso8601_date, MONTH_SHORT_NAMES
Expand Down Expand Up @@ -82,7 +82,7 @@ def write_bibtex_item(metadata: Commonmeta) -> dict:
booktitle = (
container.get("title", None) if _type in ["inbook", "inproceedings"] else None
)
language = metadata.language
language = get_language(metadata.language)
location = (
container.get("location", None)
if _type not in ["article", "phdthesis"]
Expand Down
4 changes: 2 additions & 2 deletions commonmeta/writers/schema_org_writer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Schema.org writer for commonmeta-py"""
import orjson as json
from ..utils import to_schema_org_creators, github_as_repo_url
from ..utils import to_schema_org_creators, github_as_repo_url, get_language
from ..base_utils import compact, wrap, presence, parse_attributes
from ..constants import CM_TO_SO_TRANSLATIONS

Expand Down Expand Up @@ -117,7 +117,7 @@ def write_schema_org(metadata):
wrap(metadata.subjects), content="subject", first=False
)
),
"inLanguage": metadata.language,
"inLanguage": get_language(metadata.language, format="name"),
"dateCreated": metadata.date.get("created", None),
"datePublished": metadata.date.get("published", None),
"dateModified": metadata.date.get("updated", None),
Expand Down
26 changes: 13 additions & 13 deletions tests/cassettes/test-schema_org_reader/test_arxiv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -571,13 +571,13 @@ interactions:
Content-Length:
- '45287'
Date:
- Fri, 27 Sep 2024 16:12:50 GMT
- Sun, 29 Sep 2024 08:44:43 GMT
X-Cache:
- MISS, MISS
- HIT, MISS
X-Served-By:
- cache-lga21969-LGA, cache-fra-etou8220025-FRA
- cache-lga21969-LGA, cache-fra-etou8220148-FRA
X-Timer:
- S1727453570.607760,VS0,VE520
- S1727599483.468512,VS0,VE242
cache-control:
- max-age=3600
content-security-policy:
Expand All @@ -591,7 +591,7 @@ interactions:
via:
- 1.1 google, 1.1 varnish, 1.1 varnish
x-cloud-trace-context:
- 70996ce163e93a14aabf2d2281dd1f0b
- e33c0a667e8f81f037b820dada490d93
x-frame-options:
- SAMEORIGIN
status:
Expand Down Expand Up @@ -621,19 +621,19 @@ interactions:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8c9cc70e4bbac7ed-DUS
- 8caab1660939c7c7-DUS
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json;charset=UTF-8
Date:
- Fri, 27 Sep 2024 16:12:50 GMT
- Sun, 29 Sep 2024 08:44:43 GMT
NEL:
- '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}'
Report-To:
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=M86uOkY%2BHmKkgERQ8fMD6xKIOolzZO7rwHTlrKNdDl3VbkkVWaA%2BAjDbk4hMH4vr6CuewH8WLobEBEl5NeuLZO0RQXuF7lg32XvHtzf%2B3ZZeMV9Av3nokGMpqrLfLcAkldhhRVA%3D"}],"group":"cf-nel","max_age":604800}'
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=kku5xi6i%2Bc6dW2J%2FBAPkqOS334XTsyPig7NsjqCKrx5a1sXehfLWOIJZkORBFHpcyEHoFL1%2BibmBuQ9zpjoU0T6wOjJMbtFYpe%2BbIBcr%2FiBzPJ5BEstT8EUnS8TKe5vUebqdSgQ%3D"}],"group":"cf-nel","max_age":604800}'
Server:
- cloudflare
Strict-Transport-Security:
Expand Down Expand Up @@ -665,7 +665,7 @@ interactions:
response:
body:
string: !!binary |
H4sIAILZ9mYAA7xaWXfiyJL+Kzp+qpr2IgnoMp4ndkSz2OzonnrQkhYJ2kYbiDr13+dLpQSIcve4
H4sIAHwT+WYAA7xaWXfiyJL+Kzp+qpr2IgnoMp4ndkSz2OzonnrQkhYJ2kYbiDr13+dLpQSIcve4
7syZhz5lhSJjjy+CVP+4M7VIu3v5cUfNu5c7SXysPtdq4pMWHGnyKNVF+VGUa5Xq3f1dlPoELKZH
QzxpURRQPY5IyA6D+D+c9gPyTo9XTKCF8TunfcBPTeJG9J2SAAr+9ePqmcn4mHPOLdSCNU3ufn6H
kXZEAleLiHIj7YMXpcMfHi0rZvKNgGiRl8t0NYcJ6BOq03tBSTQXYhgxl/wK5Z6r2aBaNCHumPPn
Expand Down Expand Up @@ -749,9 +749,9 @@ interactions:
Content-Type:
- application/json; charset=utf-8
Date:
- Fri, 27 Sep 2024 16:12:50 GMT
- Sun, 29 Sep 2024 08:44:44 GMT
ETag:
- W/"2e8303d763630376f4748e71ec92d1f1"
- W/"b0031bc1da359a74e851ba83fb66bf91"
Referrer-Policy:
- strict-origin-when-cross-origin
Server:
Expand All @@ -773,9 +773,9 @@ interactions:
X-Powered-By:
- Phusion Passenger(R) 6.0.23
X-Request-Id:
- 07b0e136-abc9-459c-85be-52d8c912e335
- 446a22b2-942e-43de-b566-ad85bc482022
X-Runtime:
- '0.035136'
- '0.037905'
X-XSS-Protection:
- '0'
status:
Expand Down
32 changes: 15 additions & 17 deletions tests/cassettes/test-schema_org_reader/test_blog_posting.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ interactions:
Accept-Ranges:
- bytes
Age:
- '285128'
- '431042'
Alt-Svc:
- clear
Connection:
- keep-alive
Content-Length:
- '0'
Date:
- Fri, 27 Sep 2024 16:12:44 GMT
- Sun, 29 Sep 2024 08:44:38 GMT
Ghost-Fastly:
- 'true'
Vary:
Expand All @@ -39,9 +39,9 @@ interactions:
X-Cache-Hits:
- 0, 3, 1
X-Served-By:
- cache-ams2100143-AMS, cache-ams2100143-AMS, cache-fra-eddf8230051-FRA
- cache-ams2100143-AMS, cache-ams2100143-AMS, cache-fra-eddf8230070-FRA
X-Timer:
- S1727453565.865111,VS0,VE2
- S1727599479.935885,VS0,VE2
cache-control:
- public, max-age=31536000
ghost-age:
Expand All @@ -57,7 +57,7 @@ interactions:
via:
- 1.1 varnish, 1.1 varnish, 1.1 varnish
x-request-id:
- 63d87d30-4cca-4c1b-b9ca-ec18e096d97f
- df93d0ff-f07f-46f2-84fe-89538141dd67
status:
code: 301
message: Moved Permanently
Expand Down Expand Up @@ -237,27 +237,27 @@ interactions:
Accept-Ranges:
- bytes
Age:
- '73779'
- '219693'
Alt-Svc:
- clear
Connection:
- keep-alive
Content-Length:
- '8733'
Date:
- Fri, 27 Sep 2024 16:12:44 GMT
- Sun, 29 Sep 2024 08:44:38 GMT
Ghost-Fastly:
- 'true'
Vary:
- Cookie, Accept-Encoding
X-Cache:
- MISS, HIT, HIT
X-Cache-Hits:
- 0, 1, 1
- 0, 1, 0
X-Served-By:
- cache-ams21021-AMS, cache-ams21021-AMS, cache-fra-eddf8230051-FRA
- cache-ams21021-AMS, cache-ams21021-AMS, cache-fra-eddf8230070-FRA
X-Timer:
- S1727453565.882696,VS0,VE2
- S1727599479.956337,VS0,VE1
cache-control:
- public, max-age=0
content-encoding:
Expand All @@ -277,7 +277,7 @@ interactions:
via:
- 1.1 varnish, 1.1 varnish, 1.1 varnish
x-request-id:
- 1abc9be4-6a48-45a2-87e6-4f663b52aabe
- d7f2445e-c3bc-44b0-8881-93c69f1f8dee
status:
code: 200
message: OK
Expand Down Expand Up @@ -305,27 +305,25 @@ interactions:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8c9cc6ecfda41619-DUS
- 8caab1484a7ec7e4-DUS
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json;charset=UTF-8
Date:
- Fri, 27 Sep 2024 16:12:44 GMT
- Sun, 29 Sep 2024 08:44:39 GMT
NEL:
- '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}'
Report-To:
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=hVehXeHwTwCLua6GHpmjRsOFKzSaRxyZ4%2Bt%2F4meOA9AzsdVPZjpcdSRRCKa0sM%2FlgeV0spfKCSSSR7Hvs2jzKEkwwI7VDHxWorj%2FcHVOvbgwgRYaJBfiyTY1YY0y3Ll7TxPiOPg%3D"}],"group":"cf-nel","max_age":604800}'
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=g0i7GSsud%2B%2F7oKI5ItkMzYrs633gTRtHpKCzRUh0bSazP48S8SqbSBiHYqgtQBOgI4UjDkq5Ac8GTfJE9km6sbCm1MCHRhcNeIjGDzcBM8d%2F1eacRDow7QCXvuQVi7aaBLNIa50%3D"}],"group":"cf-nel","max_age":604800}'
Server:
- cloudflare
Strict-Transport-Security:
- max-age=31536000; includeSubDomains; preload
Transfer-Encoding:
- chunked
alt-svc:
- h3=":443"; ma=86400
permissions-policy:
- interest-cohort=(),browsing-topics=()
vary:
Expand Down Expand Up @@ -391,7 +389,7 @@ interactions:
content-type:
- application/json
date:
- Fri, 27 Sep 2024 16:12:45 GMT
- Sun, 29 Sep 2024 08:44:39 GMT
permissions-policy:
- interest-cohort=()
server:
Expand Down
16 changes: 8 additions & 8 deletions tests/cassettes/test-schema_org_reader/test_dataverse.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ interactions:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8c9cc6fbed6bc81e-DUS
- 8caab1558f04c7fe-DUS
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json;charset=UTF-8
Date:
- Fri, 27 Sep 2024 16:12:47 GMT
- Sun, 29 Sep 2024 08:44:41 GMT
NEL:
- '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}'
Report-To:
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=b%2FIDuMkDoKWrGahd552RyR5G4qnxKfC16TZsRCkckd40ZrKt%2Fj92UdbXlUfhzw34lbAQuoyydkLpRq0Cuw3y73mkVZLBAKWxUN4f2AahJkMRUmYo7qFEoaVgZ5nD9V1GN0vfGps%3D"}],"group":"cf-nel","max_age":604800}'
- '{"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=ZI0tWsrmxkJGS9hcBG4w7WW58K%2Fuq%2Fg73Qo4Rz1FGuCbWAzRRTn2eWCN25O42i7SsLjwwl62CtXGXZiGwZI3P1bsPfq7g7%2F8B5N3O6z8MZot3iqzXiB0wBvsYt8B67UeQRKwoEk%3D"}],"group":"cf-nel","max_age":604800}'
Server:
- cloudflare
Strict-Transport-Security:
Expand Down Expand Up @@ -67,7 +67,7 @@ interactions:
response:
body:
string: !!binary |
H4sIAH/Z9mYAA5xXXXPiOhL9Ky5edu/OkNgGJkDV1BaBBMwQyBhig2/dB1sStoj8UZYMMan89235
H4sIAHkT+WYAA5xXXXPiOhL9Ky5edu/OkNgGJkDV1BaBBMwQyBhig2/dB1sStoj8UZYMMan89235
A0gyc+vWvuFWn+6jbql1eG1gV7iN/muD4ka/oalXNz1Nvcb76Dra3bzwuPG1IfKEwBqOKYcvV4iU
epkgXKLA+DtYkpItfTmvgoln29L0zpFiEgm6pSSFkH/+BSmYIGnkCmJ8XEEpcUVcfL02IjeUvIyo
9KZx5DJlTCIiKOLKYqsMouecxZxGvrJM4gjnjArKlWEc8TgVNAuVfxvjwfIPuSsgxmgRpMwkg39I
Expand Down Expand Up @@ -117,9 +117,9 @@ interactions:
Content-Type:
- application/json; charset=utf-8
Date:
- Fri, 27 Sep 2024 16:12:47 GMT
- Sun, 29 Sep 2024 08:44:41 GMT
ETag:
- W/"e9ee585353da9a606624ed35e3d57a2b"
- W/"1e28b954b5d93cbc96d1469a3d00a448"
Referrer-Policy:
- strict-origin-when-cross-origin
Server:
Expand All @@ -141,9 +141,9 @@ interactions:
X-Powered-By:
- Phusion Passenger(R) 6.0.23
X-Request-Id:
- 8020cc24-cdcc-47cb-b1f2-eca9139e604a
- 6cf945bb-0333-4dc9-858d-0bb91e56d28d
X-Runtime:
- '0.028964'
- '0.029253'
X-XSS-Protection:
- '0'
status:
Expand Down
Loading

0 comments on commit 30ebd3d

Please sign in to comment.