diff --git a/litstudy/__init__.py b/litstudy/__init__.py
index d96ce64..ea41d37 100644
--- a/litstudy/__init__.py
+++ b/litstudy/__init__.py
@@ -7,6 +7,7 @@
     load_ieee_csv,
     load_ris_file,
     load_scopus_csv,
+    load_semanticscholar_json,
     load_springer_csv,
     refine_crossref,
     refine_scopus,
@@ -16,6 +17,9 @@
     search_dblp,
     search_scopus,
     search_semanticscholar,
+    fastsearch_semanticscholar,
+    generate_reference_list,
+    mass_fetch_semanticscholar,
 )
 from .stats import (
     compute_year_histogram,
@@ -120,6 +124,8 @@
     "load_csv",
     "load_ieee_csv",
     "load_ris_file",
+    "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
@@ -129,6 +135,9 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
     "Affiliation",
     "Author",
     "Document",
diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
index a5eeccc..4c5dff6 100644
--- a/litstudy/sources/__init__.py
+++ b/litstudy/sources/__init__.py
@@ -1,6 +1,6 @@
 from .scopus import search_scopus, refine_scopus, fetch_scopus
 from .bibtex import load_bibtex
-from .semanticscholar import fetch_semanticscholar, search_semanticscholar, refine_semanticscholar
+from .semanticscholar import fetch_semanticscholar, search_semanticscholar, fastsearch_semanticscholar, refine_semanticscholar, load_semanticscholar_json, generate_reference_list, mass_fetch_semanticscholar
 from .crossref import fetch_crossref, refine_crossref, search_crossref
 from .ieee import load_ieee_csv
 from .springer import load_springer_csv
@@ -19,6 +19,7 @@
     "load_ieee_csv",
     "load_ris_file",
     "load_scopus_csv",
+    "load_semanticscholar_json",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",
@@ -28,4 +29,7 @@
     "search_dblp",
     "search_scopus",
     "search_semanticscholar",
+    "fastsearch_semanticscholar",
+    "generate_reference_list",
+    "mass_fetch_semanticscholar",
 ]
diff --git a/litstudy/sources/crossref.py b/litstudy/sources/crossref.py
index 7ef67b0..0ea8254 100644
--- a/litstudy/sources/crossref.py
+++ b/litstudy/sources/crossref.py
@@ -44,7 +44,7 @@ def __init__(self, entry):
 
     @property
     def name(self) -> str:
-        return self.entry["name"]
+        return self.entry.get("name")
 
 
 def _extract_title(entry):
@@ -80,7 +80,7 @@ def publisher(self):
 
     @property
     def language(self):
-        return self.get("language")
+        return self.entry.get("language")
 
     @property
     def publication_date(self):
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 0ab60bd..82afdea 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -4,6 +4,8 @@
 import logging
 import requests
 import shelve
+from ..common import robust_open
+import json
 from ..common import progress_bar
 from ..types import Document, Author, DocumentSet, DocumentIdentifier
 
@@ -18,6 +20,7 @@ def extract_id(item):
         doi=item.get("doi"),
         arxivid=item.get("arxivId"),
         s2id=item.get("paperId"),
+        pubmed=item.get("pubmed"),
     )
 
 
@@ -96,26 +99,34 @@
 DEFAULT_TIMEOUT = 3.05  # 100 requests per 5 minutes
 
 
-def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
-    params = urlencode(dict(query=query, offset=offset, limit=limit))
-    url = f"{S2_QUERY_URL}?{params}"
+def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    params = dict(query=query, offset=offset, limit=limit)
+    params.update(extraParams)
+    encparams = urlencode(params)
+    url = f"{S2_QUERY_URL}?{encparams}"
 
     if url in cache:
         return cache[url]
+
     sleep(timeout)
-    reply = session.get(url)
+    reply = session.get(url, timeout=60 * 10)
     response = reply.json()
 
     if "data" not in response:
         msg = response.get("error") or response.get("message") or "unknown"
-        raise Exception(f"error while fetching {reply.url}: {msg}")
+        if "Too Many Requests." in msg or "Endpoint request timed out" in msg:
+            logging.info(f"request_query: timeout while fetching {reply.url}: {msg}")
+            return "TIMEOUT"
+        else:
+            raise Exception(f"error while fetching {reply.url}: {msg}")
 
     cache[url] = response
     return response
 
 
-def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
-    url = S2_PAPER_URL + quote_plus(key)
+def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT, extraParams=dict()):
+    encparams = urlencode(extraParams)
+    url = S2_PAPER_URL + quote_plus(key) + "?" + encparams
 
     if url in cache:
         return cache[url]
@@ -224,6 +235,7 @@
 
     with shelve.open(CACHE_FILE) as cache:
         paper_ids = []
+        to = 0
 
         while True:
             offset = len(paper_ids)
@@ -232,6 +244,15 @@
             if not response:
                 break
 
+            # Retry with an increasing delay in case of a timeout
+            if response == "TIMEOUT":
+                to += 1
+                logging.info("Timeout: retrying in %s seconds", DEFAULT_TIMEOUT * 4 * to)
+                sleep(DEFAULT_TIMEOUT * 4 * to)
+                continue
+            else:
+                to = 0
+
             records = response["data"]
             total = response["total"]
 
@@ -256,3 +277,132 @@
                 logging.warn(f"could not find paper id {paper_id}")
 
     return DocumentSet(docs)
+
+def load_semanticscholar_json(path: str) -> DocumentSet:
+    """Import a JSON file exported from the Semantic Scholar API."""
+    docs = []
+    with robust_open(path) as f:
+        result = json.load(f)
+    data = result["data"]
+    for doc in data:
+        ids = doc.pop("externalIds")
+        for i in ids:
+            if i == "DOI":
+                doc["doi"] = ids.get("DOI").lower()
+            elif i == "ArXiv":
+                doc["arxivId"] = ids.get("ArXiv")
+            elif i == "PubMed":
+                doc["pubmed"] = ids.get("PubMed")
+        docs.append(ScholarDocument(doc))
+    return DocumentSet(docs)
+
+def fastsearch_semanticscholar(
+    query: str, *, limit: int = 1000, batch_size: int = 100, session=None
+) -> DocumentSet:
+    """Submit the given query to the SemanticScholar API and return the results
+    as a `DocumentSet`.
+
+    :param query: The search query to submit.
+    :param limit: The maximum number of results to return. Must be at most 1,000.
+    :param batch_size: The number of results to retrieve per request. Must be at most 100.
+    :param session: The `requests.Session` to use for HTTP requests.
+ """ + + if not query: + raise Exception("no query specified in `search_semanticscholar`") + + if session is None: + session = requests.Session() + + docs = [] + + with shelve.open(CACHE_FILE) as cache: + paper_ids = [] + to=0 + while True: + offset = len(docs) + + response = request_query(query, offset, batch_size, cache, session,extraParams={"fields":"title,authors,year,venue,abstract,citations,references,externalIds"}) + if not response: + break + + #Retry in case of timeout + if response == "TIMEOUT": + to=to+1 + logging.info("Timeout:",DEFAULT_TIMEOUT*4*to) + sleep(DEFAULT_TIMEOUT*4*to) + continue + else: + to=0 + + records = response["data"] + total = response["total"] + print("Gesamt:",total,"Offset:",offset) + for record in records: + ids=record.pop("externalIds") + for i in ids: + if i=="DOI": + record["doi"]=ids.get("DOI").lower() + elif i=="ArXiv": + record["arxivId"]=ids.get("ArXiv") + elif i=="PubMed": + record["pubmed"]=ids.get("PubMed") + docs.append(ScholarDocument(record)) + + + # Check if we reached the total number of papers + if len(docs) >= total: + break + + # Check if we exceeded the user-defined limit + if limit is not None and len(docs) >= limit: + docs = docs[:limit] + break + + return DocumentSet(docs) + +def generate_reference_list(docs: DocumentSet): + """Returns a list of referenced documents formattet for a fetch_semanticscholar request. + s2id: + PubMed: PMID: + DOI: DOI: + ArXiv: ARXIV: + """ + references=[] + for u in range(len(docs)): + if docs[u].references == None: + continue + for i in range(len(docs[u].references)): + doi=docs[u].references[i].doi + s2id=docs[u].references[i].s2id + arxivid=docs[u].references[i].arxivid + pubmed=docs[u].references[i].pubmed + title=docs[u].references[i].title + if doi != None: + references.append("DOI:"+doi) + elif s2id != None: + references.append(s2id) + elif pubmed != None: + references.append("PMID:"+pubmed) + elif arxivid != None: + references.append("ARXIV:"+arxivid) + else: + continue + return references + +def mass_fetch_semanticscholar(paper_ids: list, session=None) -> DocumentSet: + if session is None: + session = requests.Session() + #remove duplicates: + paper_ids=list(set(paper_ids)) + + docs = [] + + with shelve.open(CACHE_FILE) as cache: + for paper_id in progress_bar(paper_ids): + record = request_paper(paper_id, cache, session) + if record: + docs.append(ScholarDocument(record)) + else: + logging.warn(f"could not find paper id {paper_id}") + return DocumentSet(docs) \ No newline at end of file diff --git a/litstudy/types.py b/litstudy/types.py index 81f1b25..4337357 100644 --- a/litstudy/types.py +++ b/litstudy/types.py @@ -403,6 +403,8 @@ def matches(self, other: "DocumentIdentifier") -> bool: # Two identifiers match if all keys that they have in common are equal for key in self._attr: + if self._attr[key] == None: + continue if key in other._attr: if self._attr[key] != other._attr[key]: return False