-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCORD_temp.R
84 lines (60 loc) · 2.16 KB
/
CORD_temp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#One time script to get DOIs of removed records from CORD19
#NB From 2021-11-01 sample dates onwards, initial data collection from PMC includes dois
#Information on CORD19 dataset (including release dates and download links)
#https://www.semanticscholar.org/cord19/download
#https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html
#install.packages("tidyverse")
library(tidyverse)
#define functions to import CORD19 data
# Preview a CORD19 metadata file: read only its first 10 rows so the
# available variables can be inspected without loading the whole file.
#   url: location of the metadata csv (passed straight to read_csv)
#   returns: whatever read_csv returns for those first rows
seeCORD <- function(url) {
  preview_rows <- 10
  read_csv(url, n_max = preview_rows)
}
# Read only the identifier columns of a CORD19 metadata file.
# All other columns are skipped; pmcid and doi are both parsed as character.
#   url: location of the metadata csv (passed straight to read_csv)
#   returns: whatever read_csv returns for the two selected columns
getCORDids <- function(url) {
  id_columns <- cols_only(
    pmcid = col_character(),
    doi = col_character()
  )
  read_csv(url, col_types = id_columns)
}
#-------------------------------------------------------
#use dataset from 04_compare_dates.R (one time only)
#NOTE(review): `records` is assumed to hold character columns pmcid and doi,
#as both are used that way below - confirm against 04_compare_dates.R
records_CORD <- records

#CORD19 releases to check, newest first (matched to PMC sample dates)
cord19_dates <- c("2021-11-01",
                  "2021-05-31",
                  "2020-12-12",
                  "2020-10-06",
                  "2020-04-24")

#accumulator for the matched CORD19 rows of every release that is checked;
#starting from an empty tibble keeps it a data frame in all cases
CORD_select_all <- tibble(pmcid = character(),
                          doi_cord = character())

#Repeat with earlier CORD-dates until all dois are found
#(or earliest CORD version checked)
for (cord19_date in cord19_dates) {
  #pmcids that still lack a doi; NA pmcids are dropped because %in% (and the
  #join below) would match them to every CORD19 row without a pmcid
  pmcids <- records_CORD %>%
    filter(is.na(doi), !is.na(pmcid)) %>%
    pull(pmcid)

  #nothing left to look up, so earlier releases need not be downloaded
  if (length(pmcids) == 0) {
    break
  }

  #import CORD19 dataset
  url <- paste0("https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/",
                cord19_date,
                "/metadata.csv")

  #see first rows to check variables
  #CORD_see <- seeCORD(url)

  #get full CORD19 data
  CORD_ids <- getCORDids(url)

  CORD_select <- CORD_ids %>%
    filter(pmcid %in% pmcids) %>%
    rename(doi_cord = doi)

  #bind_rows() appends rows and keeps a data frame; c() on two tibbles would
  #return a plain list that write_csv() cannot write
  CORD_select_all <- bind_rows(CORD_select_all, CORD_select)

  #at most one non-missing doi per pmcid, so the join below cannot add rows
  #to records_CORD when a release lists the same pmcid more than once
  doi_lookup <- CORD_select %>%
    filter(!is.na(doi_cord)) %>%
    distinct(pmcid, .keep_all = TRUE)

  #fill in only the dois that are still missing
  records_CORD <- records_CORD %>%
    left_join(doi_lookup, by = "pmcid") %>%
    mutate(doi = coalesce(doi, doi_cord)) %>%
    select(-doi_cord)

  #drop the full id table before the next release is loaded
  rm(CORD_ids)
}

#Result: 3881 of 4103 DOIs retrieved, 222 not retrieved

#write retrieved ids to file
write_csv(CORD_select_all, "data/CORD19/CORDids_2021-11-01.csv")