Skip to content

Commit

Permalink
Adding tests
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jun 24, 2024
1 parent bf0be9e commit 79cf3e1
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 3 deletions.
2 changes: 1 addition & 1 deletion rialto_airflow/harvest/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def publications_csv(dois, csv_file) -> None:
writer.writerow(pub)


def publications_from_dois(dois: list, batch_size=200) -> str:
def publications_from_dois(dois: list, batch_size=200):
"""
Get the publications metadata for the provided list of DOIs and write as a
CSV file.
Expand Down
4 changes: 2 additions & 2 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import csv
import logging
import os
import pickle
import time

from airflow.models import Variable
from pyalex import config, Works
import requests
from ssl import SSLEOFError
Expand All @@ -12,7 +12,7 @@

from rialto_airflow.utils import invert_dict

config.email = Variable.get("openalex_email")
config.email = os.environ.get("AIRFLOW_VAR_OPENALEX_EMAIL")
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]
Expand Down
38 changes: 38 additions & 0 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pickle
import re

import pandas

from rialto_airflow.harvest import openalex


Expand Down Expand Up @@ -34,3 +36,39 @@ def test_doi_orcids_pickle(tmp_path):
assert isinstance(orcids, list)
assert len(orcids) > 0
assert re.match(r"^\d+-\d+-\d+-\d+$", orcids[0])


def test_publications_from_dois():
    """Look up two known DOIs and check the shape of the returned records.

    The result of ``openalex.publications_from_dois`` is materialised with
    ``list`` so that it can be counted and indexed.
    """
    dois = ["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"]
    pubs = list(openalex.publications_from_dois(dois))
    assert len(pubs) == 2

    # Exactly two records at this point, so unpacking is safe.
    first, second = pubs
    assert set(openalex.FIELDS) == set(first.keys()), "All fields accounted for."
    assert len(first.keys()) == 51, "first publication has 51 columns"
    assert len(second.keys()) == 51, "second publication has 51 columns"


def test_publications_csv(tmp_path):
    """Write publications for two known DOIs to a CSV and verify its rows.

    The CSV is written under ``tmp_path`` by ``openalex.publications_csv``
    and read back with pandas for the assertions.
    """
    dois = ["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"]
    pubs_csv = tmp_path / "openalex-pubs.csv"
    openalex.publications_csv(dois, pubs_csv)

    df = pandas.read_csv(pubs_csv)
    assert len(df) == 2

    # The order of the results isn't guaranteed, so compare as sets.
    expected_titles = {
        "On the Dangers of Stochastic Parrots",
        "Attention Is All You Need",
    }
    expected_dois = {
        "https://doi.org/10.48550/arxiv.1706.03762",
        "https://doi.org/10.1145/3442188.3445922",
    }
    assert set(df.title.tolist()) == expected_titles
    assert set(df.doi.tolist()) == expected_dois

0 comments on commit 79cf3e1

Please sign in to comment.