-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfetch.py
More file actions
115 lines (94 loc) · 4.03 KB
/
fetch.py
File metadata and controls
115 lines (94 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
""" Fetching arXiv papers from public API. """
import urllib.request as libreq
import time
from datetime import date, timedelta
from typing import List, Tuple
import feedparser
from paper import Paper
CATEGORY = "cs.LG"
PAGE_RESULTS = 1000
WAIT_SECONDS = 5
def get_papers(init_date=None, checkpoint=None) -> List[Paper]:
"""
Get a list of arXiv papers released on or after the later of ``init_date`` and
``checkpoint`` minus 7 days. ``checkpoint`` is the last date that the user "caught
up" and rated all papers which were currently available at the time of rating. This
is a bit janky, but it's the cleanest way I've thought of to deal with the fact that
the arXiv API only dates papers by when they were submitted, but not all papers
which are submitted on the same day are released on the same day. The 7 day cushion
accounts for the event where we rate papers which were submitted on a given day, and
sometime after there are other papers released which were submitted on that same
day.
"""
# Construct API query.
query_template = "http://export.arxiv.org/api/query?"
query_template += f"search_query=cat:{CATEGORY}"
query_template += "&sortBy=submittedDate&sortOrder=descending"
query_template += "&start={start}"
query_template += f"&max_results={PAGE_RESULTS}"
# Construct start date for papers.
if checkpoint is None and init_date is None:
start_date = None
elif checkpoint is None and init_date is not None:
start_date = init_date
elif checkpoint is not None and init_date is not None:
start_date = max(init_date, checkpoint - timedelta(days=7))
else:
raise NotImplementedError
# Query API in pages until we get all papers from the desired range.
results_len = None
papers = []
start = 0
finished = False
while not finished:
time.sleep(WAIT_SECONDS)
query = query_template.format(start=start)
with libreq.urlopen(query) as url:
response = url.read()
# Parse API response and check if we are finished paging results. We include a
# check for empty entries list: This sometimes happens when the result set is
# too big (>30,000), which is not officially supported by the arXiv API.
batch_papers, results_len = parse_response(response)
if len(batch_papers) == 0:
if start == results_len:
finished = True
else:
print(f"Received empty results list after {start}/{results_len}. Retrying...")
continue
# Check if we have gotten all papers from the desired range.
for paper in batch_papers:
# Set start date to date of most recent paper, if necessary.
if start_date is None:
start_date = paper.published
init_date = start_date
if paper.published >= start_date:
papers.append(paper)
else:
finished = True
break
start += PAGE_RESULTS
return papers, init_date
def parse_response(response: bytes) -> Tuple[List[Paper], int]:
""" Parse an Atom response from the arXiv API into a list of papers. """
feed = feedparser.parse(response.decode())
# Parse papers.
results_len = feed.feed.opensearch_totalresults
papers = []
for entry in feed.entries:
abs_pos = entry.id.find("abs/")
bare_id = entry.id[abs_pos+4:]
first_dash = entry.updated.find("-")
second_dash = entry.updated.find("-", first_dash + 1)
T_pos = entry.updated.find("T")
year = int(entry.updated[:first_dash])
month = int(entry.updated[first_dash + 1: second_dash])
day = int(entry.updated[second_dash + 1: T_pos])
papers.append(Paper(
identifier=bare_id,
title=entry.title,
authors=[a["name"] for a in entry.authors],
published=date(year, month, day),
abstract=entry.summary,
link=entry.link,
))
return papers, results_len