Skip to content

Commit

Permalink
Paginate the OpenAlex works response
Browse files Browse the repository at this point in the history
A batch size of 75 exceeds the page size of 50, which means 25 DOIs will
not be returned unless pagination is used. This commit adds pagination
with a page size of 200 (the max) and modifies the test to make sure
that the pagination is happening.
  • Loading branch information
edsu committed Jun 24, 2024
1 parent 485e4bb commit 8692b89
Show file tree
Hide file tree
Showing 3 changed files with 246 additions and 11 deletions.
11 changes: 6 additions & 5 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,13 @@ def publications_from_dois(dois: list, batch_size=75):
Look up works by DOI in batches that fit within OpenAlex request size limits
"""
for doi_batch in batched(dois, batch_size):
doi_list = "|".join([doi for doi in doi_batch])

result = Works().filter(doi=doi_list).get()
# TODO: do we need this to stay within 100,000 requests / day API quota?
time.sleep(1)
for pub in result:
yield normalize_publication(pub)

doi_list = "|".join([doi for doi in doi_batch])
for page in Works().filter(doi=doi_list).paginate(per_page=200):
for pub in page:
yield normalize_publication(pub)


def normalize_publication(pub) -> dict:
Expand Down
232 changes: 232 additions & 0 deletions test/data/openalex-dois.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
doi
10.1002/adma.202103646
10.1001/jamacardio.2021.6059
10.3389/fimmu.2022.832501
10.1161/strokeaha.122.040540
10.1001/jamainternmed.2023.2561
10.1148/radiol.220882
10.1016/s0140-6736(24)01019-5
10.1073/pnas.2121609119
10.1038/s41524-022-00846-z
10.1016/j.ssresearch.2023.102873
10.1021/jacs.2c13551
10.1245/s10434-023-13531-2
10.1089/dia.2023.0405
10.3390/antiox11112189
10.1158/0008-5472.can-22-0554
10.3390/cancers15092564
10.1177/0272989x241232967
10.1182/bloodadvances.2024012637
10.1136/bmjresp-2022-001268
10.1021/acs.jpcc.2c02381
10.3847/1538-4357/acc85c
10.1161/strokeaha.122.039182
10.1063/5.0113079
10.1007/s11912-022-01192-5
10.1182/blood-2023-184531
10.1093/mnras/stac1551
10.1016/j.xfre.2023.12.001
10.1161/str.55.suppl_1.tmp91
10.48550/arxiv.2402.17113
10.1101/2024.03.12.24303785
10.26434/chemrxiv-2024-h0k05
10.3847/1538-4357/ad41de
10.1016/j.crmeth.2024.100779
10.1111/pan.14514
10.1080/21645515.2022.2072138
10.1101/2022.07.08.499233
10.1007/978-3-031-19772-7_19
10.1088/1367-2630/acc201
10.1093/jpids/piad057
10.1145/3581784.3607033
10.1182/blood-2023-190734
10.1053/j.ajkd.2023.04.005
10.1007/978-1-0716-2986-4_4
10.1002/hon.3163_151
10.2337/db23-1792-pub
10.1101/2023.08.22.23294452
10.1182/blood-2023-186853
10.1182/blood-2023-178257
10.1145/3572848.3577515
10.1117/12.2649314
10.48550/arxiv.2305.05786
10.1093/humrep/dead093.724
10.1111/jon.13144
10.1136/jnis-2023-snis.3
10.1161/strokeaha.123.044058
10.48550/arxiv.2311.16933
10.1681/asn.20233411s167a
10.1681/asn.20233411s1901b
10.1161/str.45.suppl_1.wp175
10.48550/arxiv.2202.12311
10.2337/db22-262-or
10.1101/2022.10.17.512618
10.1161/str.47.suppl_1.tp50
10.1161/str.46.suppl_1.tmp50
10.1161/str.44.suppl_1.awmp11
10.48550/arxiv.2203.15809
10.1007/s11695-022-06092-y
10.26226/m.6275705e66d5dcf63a31173d
10.1200/jco.2022.40.16_suppl.7553
10.1016/j.hfh.2022.100020
10.48550/arxiv.2207.14349
10.48550/arxiv.2208.09132
10.1353/hub.2017.0050
10.21203/rs.3.rs-2219318/v1
10.2196/preprints.43036
10.1038/s41587-019-0114-2
10.1007/978-1-4939-7493-1_12
10.1021/jacs.6b09575
10.1103/physrevb.89.115114
10.1182/blood-2016-05-718528
10.1182/blood-2017-10-811224
10.1021/acsenergylett.8b01441
10.1086/308723
10.1161/strokeaha.109.577650
10.1145/1133255.1134018
10.1109/wacv.2019.00156
10.1016/s2352-3026(20)30221-0
10.1158/2159-8290.cd-20-0282
10.1177/0272989x18754513
10.18632/oncotarget.24310
10.1109/tmi.2020.2974159
10.1148/radiol.2021203651
10.1109/tcyb.2020.3016953
10.1182/blood.v128.22.181.181
10.1007/s00268-019-05118-4
10.1126/sciadv.abb2210
10.1016/s2468-2667(21)00162-6
10.1145/3394486.3403142
10.1161/strokeaha.121.034444
10.1177/17474930211065635
10.1609/aaai.v33i01.33011085
10.1039/d0fd00115e
10.1053/j.gastro.2020.05.100
10.1029/2021wr030352
10.1111/pedi.12939
10.1161/jaha.121.022880
10.1002/ima.22423
10.4324/9781003064350-3
10.1111/pedi.12903
10.1103/physrevd.101.043011
10.1634/theoncologist.2020-0040
10.1182/blood-2020-134798
10.1101/2021.08.02.21261516
10.4300/jgme-d-19-00556.1
10.1182/blood.v120.21.153.153
10.1007/s00464-019-07012-5
10.1182/blood.v124.21.4434.4434
10.1139/cjfas-2019-0445
10.1200/jco.2021.39.15_suppl.3006
10.1016/j.soard.2021.07.019
10.1182/blood-2018-99-118828
10.1182/blood-2018-99-119974
10.48550/arxiv.1907.12727
10.1016/j.ccl.2019.07.013
10.1145/3472749.3474809
10.1182/blood-2021-145124
10.1182/blood.v106.11.3344.3344
10.1182/blood-2019-131761
10.1182/blood.v126.23.1536.1536
10.1161/str.44.suppl_1.a179
10.1016/j.jstrokecerebrovasdis.2020.104820
10.1161/str.51.suppl_1.tp137
10.1182/blood.v108.11.813.813
10.1161/str.47.suppl_1.wp29
10.1161/str.43.suppl_1.a2546
10.1136/heartjnl-2014-307109.254
10.48550/arxiv.1808.02883
10.5744/fa.2021.0011
10.1182/blood.v110.11.303.303
10.1182/blood-2019-131532
10.1161/str.51.suppl_1.wp15
10.1200/jco.2020.38.15_suppl.9012
10.37526/1526-744x.2020.47.4.343
10.1145/3259006
10.2139/ssrn.3479447
10.1007/978-3-319-74365-3_215-1
10.1161/str.52.suppl_1.p321
10.1161/str.52.suppl_1.p513
10.1002/adhm.202170038
10.1016/s2152-2650(21)01559-7
10.1038/nmat4465
10.1016/s0140-6736(10)60491-6
10.1001/jama.2010.1862
10.1016/s1470-2045(15)00533-1
10.1002/ana.21632
10.1182/blood-2015-10-673145
10.1182/blood-2013-03-491514
10.1145/1073204.1073268
10.1046/j.1365-8711.2002.05206.x
10.1213/00000539-199208000-00023
10.1210/en.2009-0923
10.1021/jz5020532
10.1042/cs20140059
10.3109/10428194.2012.742521
10.1038/tp.2017.12
10.1063/1.5041381
10.1046/j.1365-8711.2000.03621.x
10.1136/neurintsurg-2013-010973
10.1093/cercor/bhs165
10.2217/nnm.13.3
10.1021/acs.inorgchem.8b00902
10.1021/acs.jpclett.9b00475
10.1161/01.str.28.7.1501
10.1182/blood-2004-10-3820
10.1063/1.2956679
10.1145/1926385.1926407
10.3847/0067-0049/227/1/3
10.1001/jamainternmed.2019.0299
10.1016/s1384-1076(01)00065-3
10.1093/mnras/sts288
10.1145/2555243.2555258
10.1093/mnras/stu2091
10.1093/mnras/stw615
10.1056/nejmc1803856
10.1002/hon.57_2629
10.1086/368148
10.1182/bloodadvances.2019032136
10.1145/1941553.1941558
10.1212/wnl.0b013e3182a08f07
10.1080/13607863.2018.1455167
10.1007/s10562-018-2542-x
10.1145/2499368.2451150
10.1007/s10909-018-1935-y
10.3390/rs11141701
10.1016/j.bpsc.2019.02.003
10.1002/ana.410320319
10.1021/acs.jpclett.7b01549
10.1117/12.2232912
10.1002/ajpa.23267
10.1017/s0950268818000857
10.1093/pm/pnx061
10.1007/s10815-015-0525-z
10.1177/1747493019851282
10.1145/543552.512531
10.48550/arxiv.1411.0050
10.1200/jco.2016.34.15_suppl.9000
10.1002/hon.2438_115
10.1016/j.fertnstert.2018.02.032
10.1109/iccee.2008.48
10.1063/1.2905657
10.1176/appi.ajp.2008.08050764
10.1161/str.50.suppl_1.177
10.1016/s0016-5085(19)36826-x
10.1101/373886
10.1136/neurintsurg-2016-012589.57
10.1142/9789814623995_0379
10.1016/j.jval.2017.08.2450
10.1063/1.2905673
10.1142/9789814293792_0034
10.1016/j.juro.2018.02.566
10.1117/12.2313927
10.1016/j.ijrobp.2018.07.077
10.1016/j.jamcollsurg.2018.07.200
10.1161/str.50.suppl_1.wmp115
10.2337/db19-2398-pub
10.1161/01.str.23.6.912
10.1016/j.hoc.2013.10.004
10.1002/adma.201600213
10.1016/j.fertnstert.2005.07.1044
10.1055/s-0034-1396254
10.1016/b978-0-7216-0081-9.50029-7
14 changes: 8 additions & 6 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,14 @@ def test_doi_orcids_pickle(tmp_path):


def test_publications_from_dois():
pubs = list(
openalex.publications_from_dois(
["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"]
)
)
assert len(pubs) == 2
# get 231 dois that we know are in openalex
dois = pandas.read_csv("test/data/openalex-dois.csv").doi.to_list()
assert len(dois) == 231

# look up the publication metadata for them
pubs = list(openalex.publications_from_dois(dois))
assert len(pubs) == 231, "should paginate (page size=200)"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 51, "first publication has 51 columns"
assert len(pubs[1].keys()) == 51, "second publication has 51 columns"
Expand Down

0 comments on commit 8692b89

Please sign in to comment.