diff --git a/.gitignore b/.gitignore index 09da277..96a430d 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# rialto-airflow data/ +.DS_Store + diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py index d308b4b..fe69bab 100644 --- a/rialto_airflow/harvest/dimensions.py +++ b/rialto_airflow/harvest/dimensions.py @@ -36,7 +36,8 @@ def dois_from_orcid(orcid): logging.warning("Truncated results for ORCID %s", orcid) for pub in result["publications"]: if pub.get("doi"): - yield pub["doi"] + doi_id = pub["doi"].replace("https://doi.org/", "") + yield doi_id def doi_orcids_pickle(authors_csv, pickle_file, limit=None) -> None: diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 269f3d4..30c3da1 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -62,7 +62,8 @@ def dois_from_orcid(orcid: str): # not all publications have DOIs doi = pub.get("doi") if doi: - yield doi + doi_id = doi.replace("https://doi.org/", "") + yield doi_id def works_from_author_id(author_id, limit=None): diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py index 6d0bd2d..02c3e41 100644 --- a/rialto_airflow/harvest/sul_pub.py +++ b/rialto_airflow/harvest/sul_pub.py @@ -4,7 +4,7 @@ import requests -sul_pub_fields = [ +SUL_PUB_FIELDS = [ "authorship", "title", "abstract", @@ -35,7 +35,7 @@ def sul_pub_csv(csv_file, host, key, since=None, limit=None): with open(csv_file, "w") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields) + writer = csv.DictWriter(csvfile, fieldnames=SUL_PUB_FIELDS) writer.writeheader() for row in harvest(host, key, since, limit): writer.writerow(row) @@ -73,7 +73,7 @@ def harvest(host, key, since, limit): more = False break - pub = {key: record[key] for key in record if key in sul_pub_fields} + pub = {key: record[key] for key in record if key in SUL_PUB_FIELDS} pub["doi"] = extract_doi(record) yield pub @@ -82,5 +82,6 @@ def harvest(host, key, since, limit): def extract_doi(record): for id in record.get("identifier"): if id["type"] == "doi": - return id["id"] + doi_id = id["id"].replace("https://doi.org/", "") + return doi_id return None diff --git a/test/harvest/test_dimensions.py b/test/harvest/test_dimensions.py index f7007bd..9aff083 100644 --- a/test/harvest/test_dimensions.py +++ b/test/harvest/test_dimensions.py @@ -22,6 +22,7 @@ def test_doi_orcids_dict(tmpdir): assert len(doi_orcids) > 0 assert doi_orcids["10.1109/lra.2018.2890209"] == ["0000-0002-0770-2940"] + assert "https://doi.org/" not in list(doi_orcids.keys())[0], "doi is an ID" def test_publications_from_dois(): diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 4b0d1bc..2516876 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -30,6 +30,7 @@ def test_doi_orcids_pickle(tmp_path): assert len(mapping) > 0 doi = list(mapping.keys())[0] + assert "https://doi.org/" not in doi, "doi is an ID" assert "/" in doi orcids = mapping[doi]