Skip to content

Commit

Permalink
Retry OpenAlex SSL exceptions
Browse files Browse the repository at this point in the history
I noticed that I hit some SSL exceptions when harvesting more data from
OpenAlex (AIRFLOW_VAR_DEV_LIMIT=10000).

```
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/taskinstance.py", line 465, in _execute_task
    result = _execute_callable(context=context, **execute_callable_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/taskinstance.py", line 432, in _execute_callable
    return execute_callable(context=context, **execute_callable_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/baseoperator.py", line 401, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/decorators/base.py", line 265, in execute
    return_value = super().execute(context)
                   ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/baseoperator.py", line 401, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/operators/python.py", line 235, in execute
    return_value = self.execute_callable()
                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/airflow/operators/python.py", line 252, in execute_callable
    return self.python_callable(*self.op_args, **self.op_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/airflow/rialto_airflow/dags/harvest.py", line 59, in openalex_harvest_dois
    openalex.doi_orcids_pickle(authors_csv, pickle_file, limit=dev_limit)
  File "/opt/airflow/rialto_airflow/harvest/openalex.py", line 22, in doi_orcids_pickle
    orcid_dois[orcid] = list(dois_from_orcid(orcid))
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/airflow/rialto_airflow/harvest/openalex.py", line 41, in dois_from_orcid
    author_resp = requests.get(
                  ^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/requests/sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/airflow/.local/lib/python3.12/site-packages/requests/adapters.py", line 698, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))
[2024-06-24, 11:14:31 UTC] {taskinstance.py:1206} INFO - Marking task as FAILED. dag_id=harvest, task_id=openalex_harvest_dois, run_id=manual__2024-06-24T11:02:02.383856+00:00, execution_date=20240624T110202, start_date=20240624T110205, end_date=20240624T111431
[2024-06-24, 11:14:31 UTC] {standard_task_runner.py:110} ERROR - Failed to execute job 222 for task openalex_harvest_dois (HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)'))); 86)
[2024-06-24, 11:14:31 UTC] {local_task_job_runner.py:240} INFO - Task exited with return code 1
```

This commit uses tenacity to retry these requests, waiting a random
1 to 5 seconds between attempts and giving up after 60 seconds of
trying. We may want to adjust these values based on how well they work.
The retry behavior only applies to SSLError for now, so that we can
still get insight into any other errors that we might encounter.
  • Loading branch information
edsu committed Jun 24, 2024
1 parent 6836995 commit 7441c14
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 4 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ dependencies = [
"pandas",
"requests",
"python-dotenv",
"dimcli"
"dimcli",
"tenacity"
]

[tool.pytest.ini_options]
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ sphinxcontrib-serializinghtml==1.1.10
# via sphinx
stack-data==0.6.3
# via ipython
tenacity==8.4.1
# via rialto-airflow (pyproject.toml)
tqdm==4.66.4
# via dimcli
traitlets==5.14.3
# via
# ipython
# matplotlib-inline
typing-extensions==4.12.2
# via ipython
tzdata==2024.1
# via pandas
urllib3==2.2.1
Expand Down
11 changes: 10 additions & 1 deletion rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import time

import requests
from requests.exceptions import SSLError
from tenacity import retry, retry_if_exception_type, stop_after_delay, wait_random

from rialto_airflow.utils import invert_dict

Expand All @@ -27,6 +29,11 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):
pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL)


@retry(
wait=wait_random(1, 5),
stop=stop_after_delay(60),
retry=retry_if_exception_type(SSLError),
)
def dois_from_orcid(orcid: str):
"""
Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
Expand Down Expand Up @@ -79,5 +86,7 @@ def works_from_author_id(author_id, limit=None):
else:
yield result
else:
logging.error(f"encountered non-200 response: {url} {params}")
logging.error(
f"encountered HTTP {resp.status_code} response from {url} {params}: {resp.text}"
)
has_more = False

0 comments on commit 7441c14

Please sign in to comment.