diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 269f3d4..afeb7e9 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -118,12 +118,13 @@ def publications_from_dois(dois: list, batch_size=75): Look up works by DOI in batches that fit within OpenAlex request size limits """ for doi_batch in batched(dois, batch_size): - doi_list = "|".join([doi for doi in doi_batch]) - - result = Works().filter(doi=doi_list).get() + # TODO: do we need this to stay within 100,000 requests / day API quota? time.sleep(1) - for pub in result: - yield normalize_publication(pub) + + doi_list = "|".join([doi for doi in doi_batch]) + for page in Works().filter(doi=doi_list).paginate(per_page=200): + for pub in page: + yield normalize_publication(pub) def normalize_publication(pub) -> dict: diff --git a/test/data/openalex-dois.csv b/test/data/openalex-dois.csv new file mode 100644 index 0000000..a56178f --- /dev/null +++ b/test/data/openalex-dois.csv @@ -0,0 +1,232 @@ +doi +10.1002/adma.202103646 +10.1001/jamacardio.2021.6059 +10.3389/fimmu.2022.832501 +10.1161/strokeaha.122.040540 +10.1001/jamainternmed.2023.2561 +10.1148/radiol.220882 +10.1016/s0140-6736(24)01019-5 +10.1073/pnas.2121609119 +10.1038/s41524-022-00846-z +10.1016/j.ssresearch.2023.102873 +10.1021/jacs.2c13551 +10.1245/s10434-023-13531-2 +10.1089/dia.2023.0405 +10.3390/antiox11112189 +10.1158/0008-5472.can-22-0554 +10.3390/cancers15092564 +10.1177/0272989x241232967 +10.1182/bloodadvances.2024012637 +10.1136/bmjresp-2022-001268 +10.1021/acs.jpcc.2c02381 +10.3847/1538-4357/acc85c +10.1161/strokeaha.122.039182 +10.1063/5.0113079 +10.1007/s11912-022-01192-5 +10.1182/blood-2023-184531 +10.1093/mnras/stac1551 +10.1016/j.xfre.2023.12.001 +10.1161/str.55.suppl_1.tmp91 +10.48550/arxiv.2402.17113 +10.1101/2024.03.12.24303785 +10.26434/chemrxiv-2024-h0k05 +10.3847/1538-4357/ad41de +10.1016/j.crmeth.2024.100779 +10.1111/pan.14514 +10.1080/21645515.2022.2072138 +10.1101/2022.07.08.499233 +10.1007/978-3-031-19772-7_19 +10.1088/1367-2630/acc201 +10.1093/jpids/piad057 +10.1145/3581784.3607033 +10.1182/blood-2023-190734 +10.1053/j.ajkd.2023.04.005 +10.1007/978-1-0716-2986-4_4 +10.1002/hon.3163_151 +10.2337/db23-1792-pub +10.1101/2023.08.22.23294452 +10.1182/blood-2023-186853 +10.1182/blood-2023-178257 +10.1145/3572848.3577515 +10.1117/12.2649314 +10.48550/arxiv.2305.05786 +10.1093/humrep/dead093.724 +10.1111/jon.13144 +10.1136/jnis-2023-snis.3 +10.1161/strokeaha.123.044058 +10.48550/arxiv.2311.16933 +10.1681/asn.20233411s167a +10.1681/asn.20233411s1901b +10.1161/str.45.suppl_1.wp175 +10.48550/arxiv.2202.12311 +10.2337/db22-262-or +10.1101/2022.10.17.512618 +10.1161/str.47.suppl_1.tp50 +10.1161/str.46.suppl_1.tmp50 +10.1161/str.44.suppl_1.awmp11 +10.48550/arxiv.2203.15809 +10.1007/s11695-022-06092-y +10.26226/m.6275705e66d5dcf63a31173d +10.1200/jco.2022.40.16_suppl.7553 +10.1016/j.hfh.2022.100020 +10.48550/arxiv.2207.14349 +10.48550/arxiv.2208.09132 +10.1353/hub.2017.0050 +10.21203/rs.3.rs-2219318/v1 +10.2196/preprints.43036 +10.1038/s41587-019-0114-2 +10.1007/978-1-4939-7493-1_12 +10.1021/jacs.6b09575 +10.1103/physrevb.89.115114 +10.1182/blood-2016-05-718528 +10.1182/blood-2017-10-811224 +10.1021/acsenergylett.8b01441 +10.1086/308723 +10.1161/strokeaha.109.577650 +10.1145/1133255.1134018 +10.1109/wacv.2019.00156 +10.1016/s2352-3026(20)30221-0 +10.1158/2159-8290.cd-20-0282 +10.1177/0272989x18754513 +10.18632/oncotarget.24310 +10.1109/tmi.2020.2974159 +10.1148/radiol.2021203651 +10.1109/tcyb.2020.3016953 +10.1182/blood.v128.22.181.181 +10.1007/s00268-019-05118-4 +10.1126/sciadv.abb2210 +10.1016/s2468-2667(21)00162-6 +10.1145/3394486.3403142 +10.1161/strokeaha.121.034444 +10.1177/17474930211065635 +10.1609/aaai.v33i01.33011085 +10.1039/d0fd00115e +10.1053/j.gastro.2020.05.100 +10.1029/2021wr030352 +10.1111/pedi.12939 +10.1161/jaha.121.022880 +10.1002/ima.22423 +10.4324/9781003064350-3 +10.1111/pedi.12903 +10.1103/physrevd.101.043011 +10.1634/theoncologist.2020-0040 +10.1182/blood-2020-134798 +10.1101/2021.08.02.21261516 +10.4300/jgme-d-19-00556.1 +10.1182/blood.v120.21.153.153 +10.1007/s00464-019-07012-5 +10.1182/blood.v124.21.4434.4434 +10.1139/cjfas-2019-0445 +10.1200/jco.2021.39.15_suppl.3006 +10.1016/j.soard.2021.07.019 +10.1182/blood-2018-99-118828 +10.1182/blood-2018-99-119974 +10.48550/arxiv.1907.12727 +10.1016/j.ccl.2019.07.013 +10.1145/3472749.3474809 +10.1182/blood-2021-145124 +10.1182/blood.v106.11.3344.3344 +10.1182/blood-2019-131761 +10.1182/blood.v126.23.1536.1536 +10.1161/str.44.suppl_1.a179 +10.1016/j.jstrokecerebrovasdis.2020.104820 +10.1161/str.51.suppl_1.tp137 +10.1182/blood.v108.11.813.813 +10.1161/str.47.suppl_1.wp29 +10.1161/str.43.suppl_1.a2546 +10.1136/heartjnl-2014-307109.254 +10.48550/arxiv.1808.02883 +10.5744/fa.2021.0011 +10.1182/blood.v110.11.303.303 +10.1182/blood-2019-131532 +10.1161/str.51.suppl_1.wp15 +10.1200/jco.2020.38.15_suppl.9012 +10.37526/1526-744x.2020.47.4.343 +10.1145/3259006 +10.2139/ssrn.3479447 +10.1007/978-3-319-74365-3_215-1 +10.1161/str.52.suppl_1.p321 +10.1161/str.52.suppl_1.p513 +10.1002/adhm.202170038 +10.1016/s2152-2650(21)01559-7 +10.1038/nmat4465 +10.1016/s0140-6736(10)60491-6 +10.1001/jama.2010.1862 +10.1016/s1470-2045(15)00533-1 +10.1002/ana.21632 +10.1182/blood-2015-10-673145 +10.1182/blood-2013-03-491514 +10.1145/1073204.1073268 +10.1046/j.1365-8711.2002.05206.x +10.1213/00000539-199208000-00023 +10.1210/en.2009-0923 +10.1021/jz5020532 +10.1042/cs20140059 +10.3109/10428194.2012.742521 +10.1038/tp.2017.12 +10.1063/1.5041381 +10.1046/j.1365-8711.2000.03621.x +10.1136/neurintsurg-2013-010973 +10.1093/cercor/bhs165 +10.2217/nnm.13.3 +10.1021/acs.inorgchem.8b00902 +10.1021/acs.jpclett.9b00475 +10.1161/01.str.28.7.1501 +10.1182/blood-2004-10-3820 +10.1063/1.2956679 +10.1145/1926385.1926407 +10.3847/0067-0049/227/1/3 +10.1001/jamainternmed.2019.0299 +10.1016/s1384-1076(01)00065-3 +10.1093/mnras/sts288 +10.1145/2555243.2555258 +10.1093/mnras/stu2091 +10.1093/mnras/stw615 +10.1056/nejmc1803856 +10.1002/hon.57_2629 +10.1086/368148 +10.1182/bloodadvances.2019032136 +10.1145/1941553.1941558 +10.1212/wnl.0b013e3182a08f07 +10.1080/13607863.2018.1455167 +10.1007/s10562-018-2542-x +10.1145/2499368.2451150 +10.1007/s10909-018-1935-y +10.3390/rs11141701 +10.1016/j.bpsc.2019.02.003 +10.1002/ana.410320319 +10.1021/acs.jpclett.7b01549 +10.1117/12.2232912 +10.1002/ajpa.23267 +10.1017/s0950268818000857 +10.1093/pm/pnx061 +10.1007/s10815-015-0525-z +10.1177/1747493019851282 +10.1145/543552.512531 +10.48550/arxiv.1411.0050 +10.1200/jco.2016.34.15_suppl.9000 +10.1002/hon.2438_115 +10.1016/j.fertnstert.2018.02.032 +10.1109/iccee.2008.48 +10.1063/1.2905657 +10.1176/appi.ajp.2008.08050764 +10.1161/str.50.suppl_1.177 +10.1016/s0016-5085(19)36826-x +10.1101/373886 +10.1136/neurintsurg-2016-012589.57 +10.1142/9789814623995_0379 +10.1016/j.jval.2017.08.2450 +10.1063/1.2905673 +10.1142/9789814293792_0034 +10.1016/j.juro.2018.02.566 +10.1117/12.2313927 +10.1016/j.ijrobp.2018.07.077 +10.1016/j.jamcollsurg.2018.07.200 +10.1161/str.50.suppl_1.wmp115 +10.2337/db19-2398-pub +10.1161/01.str.23.6.912 +10.1016/j.hoc.2013.10.004 +10.1002/adma.201600213 +10.1016/j.fertnstert.2005.07.1044 +10.1055/s-0034-1396254 +10.1016/b978-0-7216-0081-9.50029-7 diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index 4b0d1bc..3ef1a64 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -39,12 +39,14 @@ def test_doi_orcids_pickle(tmp_path): def test_publications_from_dois(): - pubs = list( - openalex.publications_from_dois( - ["10.48550/arxiv.1706.03762", "10.1145/3442188.3445922"] - ) - ) - assert len(pubs) == 2 + # get 231 dois that we know are in openalex + dois = pandas.read_csv("test/data/openalex-dois.csv").doi.to_list() + assert len(dois) == 231 + + # look up the publication metadata for them + pubs = list(openalex.publications_from_dois(dois)) + assert len(pubs) == 231, "should paginate (page size=200)" + assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique" assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." assert len(pubs[0].keys()) == 51, "first publication has 51 columns" assert len(pubs[1].keys()) == 51, "second publication has 51 columns"