diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py index 835e3f9..d308b4b 100644 --- a/rialto_airflow/harvest/dimensions.py +++ b/rialto_airflow/harvest/dimensions.py @@ -66,7 +66,7 @@ def publications_csv(dois, csv_file) -> None: writer.writerow(pub) -def publications_from_dois(dois: list, batch_size=200) -> str: +def publications_from_dois(dois: list, batch_size=200): """ Get the publications metadata for the provided list of DOIs and write as a CSV file. diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 4124e10..54ee9bc 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -1,9 +1,9 @@ import csv import logging +import os import pickle import time -from airflow.models import Variable from pyalex import config, Works import requests from ssl import SSLEOFError @@ -12,7 +12,7 @@ from rialto_airflow.utils import invert_dict -config.email = Variable.get("openalex_email") +config.email = os.environ.get("AIRFLOW_VAR_OPENALEX_EMAIL") config.max_retries = 0 config.retry_backoff_factor = 0.1 config.retry_http_codes = [429, 500, 503] diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 9028c22..4b0d1bc 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -1,6 +1,8 @@ import pickle import re +import pandas + from rialto_airflow.harvest import openalex @@ -34,3 +36,39 @@ def test_doi_orcids_pickle(tmp_path): assert isinstance(orcids, list) assert len(orcids) > 0 assert re.match(r"^\d+-\d+-\d+-\d+$", orcids[0]) + + +def test_publications_from_dois(): + pubs = list( + openalex.publications_from_dois( + ["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"] + ) + ) + assert len(pubs) == 2 + assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." + assert len(pubs[0].keys()) == 51, "first publication has 51 columns" + assert len(pubs[1].keys()) == 51, "second publication has 51 columns" + + +def test_publications_csv(tmp_path): + pubs_csv = tmp_path / "openalex-pubs.csv" + openalex.publications_csv( + ["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"], pubs_csv + ) + + df = pandas.read_csv(pubs_csv) + + assert len(df) == 2 + + # the order of the results isn't guaranteed but make sure things are coming back + + assert set(df.title.tolist()) == set( + ["On the Dangers of Stochastic Parrots", "Attention Is All You Need"] + ) + + assert set(df.doi.tolist()) == set( + [ + "https://doi.org/10.48550/arxiv.1706.03762", + "https://doi.org/10.1145/3442188.3445922", + ] + )