Skip to content

Commit

Permalink
Use pyalex
Browse files Browse the repository at this point in the history
Convert other functions to using pyalex for consistency.
  • Loading branch information
edsu committed Jun 24, 2024
1 parent 485e4bb commit 2eaf184
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 55 deletions.
71 changes: 20 additions & 51 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
import pickle
import time

from pyalex import config, Works
import requests
from pyalex import config, Works, Authors
from ssl import SSLEOFError
from tenacity import retry, retry_if_exception_type, stop_after_delay, wait_random
from more_itertools import batched
Expand Down Expand Up @@ -42,7 +41,7 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):
stop=stop_after_delay(60),
retry=retry_if_exception_type(SSLEOFError),
)
def dois_from_orcid(orcid: str):
def dois_from_orcid(orcid: str, limit=None):
"""
Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
"""
Expand All @@ -52,54 +51,24 @@ def dois_from_orcid(orcid: str):

logging.info(f"looking up dois for orcid {orcid}")

orcid = f"https://orcid.org/{orcid}"
author_resp = requests.get(
f"https://api.openalex.org/authors/{orcid}", allow_redirects=True
)
if author_resp.status_code == 200:
author_id = author_resp.json()["id"].replace("https://openalex.org/", "")
for pub in works_from_author_id(author_id):
# not all publications have DOIs
doi = pub.get("doi")
if doi:
yield doi


def works_from_author_id(author_id, limit=None):
    """
    Pass in the OpenAlex Author ID and get back an iterator of works.

    Pages through https://api.openalex.org/works (filtered by author.id,
    200 results per page) and yields each result dict as it is received.

    Iteration stops when:
      * a page comes back with no results,
      * `limit` works have been yielded (when `limit` is not None), or
      * a non-200 response is received, which is logged as an error.
    """
    url = "https://api.openalex.org/works"
    params = {"filter": f"author.id:{author_id}", "per_page": 200}

    count = 0
    page = 0

    # checking the limit before each request avoids fetching (and sleeping for)
    # an extra page when the limit falls exactly on a page boundary
    while limit is None or count < limit:
        page += 1
        params["page"] = page

        logging.info(f"fetching works for {author_id} page={page}")
        resp = requests.get(url, params)

        if resp.status_code != 200:
            logging.error(
                f"encountered HTTP {resp.status_code} response from {url} {params}: {resp.text}"
            )
            return

        # TODO: get a key so we don't have to sleep!
        time.sleep(1)

        # treat a missing/null "results" key like an empty page instead of
        # raising a TypeError on len(None)
        results = resp.json().get("results") or []
        if not results:
            return

        for result in results:
            count += 1
            yield result
            # stop right away rather than walking the remainder of the page
            if limit is not None and count >= limit:
                return
# get the first (and hopefully only) openalex id for the orcid
authors = Authors().filter(orcid=orcid).get()
if len(authors) == 0:
return []
elif len(authors) > 1:
logging.warn(f"found more than one openalex author id for {orcid}")
author_id = authors[0]["id"]

# get all the works for the openalex author id
work_count = 0
for page in Works().filter(author={"id": author_id}).paginate(per_page=200):
for pub in page:
if pub.get("doi"):
work_count += 1
if limit is not None and work_count > limit:
return
else:
yield pub["doi"]


def publications_csv(dois: list, csv_file: str) -> None:
Expand Down
8 changes: 4 additions & 4 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ def test_dois_from_orcid():
assert len(dois) >= 54


def test_works_from_author_id():
def test_dois_from_orcid_paging():
# the openalex api returns at most 200 publications per page, so ensure that paging is working
# for Akihisa Inoue who has a lot of publications (> 4,000)
works = list(openalex.works_from_author_id("a5008412118", limit=600))
assert len(works) == 600, "paging is limiting to 600 works"
assert len(set([work["id"] for work in works])) == 600, "the works are unique"
dois = list(openalex.dois_from_orcid("0000-0001-6009-8669", limit=600))
assert len(dois) == 600, "paging is limiting to 600 works"
assert len(set(dois)) == 600, "the dois are unique"


def test_doi_orcids_pickle(tmp_path):
Expand Down

0 comments on commit 2eaf184

Please sign in to comment.