-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrewrite.py
executable file
·81 lines (68 loc) · 2.01 KB
/
rewrite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import re
import datetime
from warcio import ArchiveIterator, WARCWriter
def main():
    """
    Walk data/SLAC.cdx line by line, look up the WARC record each entry
    points at, and write every record that rewrite() changed to
    data/SLAC_2022.warc (uncompressed). Records that rewrite() returns
    None for are not written.
    """
    # Context managers make sure both the output WARC and the CDX index are
    # closed even when a record fails to parse or to be written.
    with open("data/SLAC_2022.warc", "wb") as fh:
        writer = WARCWriter(fh, gzip=False)
        with open("data/SLAC.cdx") as cdx_file:
            for cdx_line in cdx_file:
                stripped = cdx_line.strip()
                # Blank lines and the optional "CDX ..." field-legend header
                # carry no capture data and would otherwise crash the parser.
                if not stripped or stripped.startswith("CDX "):
                    continue
                cdx = parse_cdx_line(cdx_line)
                rec = get_record(cdx)
                new_rec = rewrite(rec, cdx)
                if new_rec:
                    writer.write_record(new_rec)
def rewrite(rec, cdx):
    """
    Bring a WARC response record in line with its CDX entry.

    When the CDX datetime or URL differs from the record's WARC-Date or
    WARC-Target-URI header, the header is overwritten with the CDX value
    and the previous value is preserved under WARC-Creation-Date or
    WARC-Source-URI respectively. The record is modified in place.

    Returns the record if at least one header was changed; returns None
    for non-response records and for records that already match.
    """
    if rec.rec_type != "response":
        return None

    headers = rec.rec_headers

    # (CDX key, header to overwrite, header that keeps the old value);
    # the datetime pair is handled before the URL pair.
    header_map = (
        ("datetime", "WARC-Date", "WARC-Creation-Date"),
        ("url", "WARC-Target-URI", "WARC-Source-URI"),
    )

    changed = False
    for cdx_key, live_header, backup_header in header_map:
        if cdx[cdx_key] != headers[live_header]:
            headers[backup_header] = headers[live_header]
            headers[live_header] = cdx[cdx_key]
            changed = True

    return rec if changed else None
def parse_cdx_line(cdx_line):
    """
    Turn one space-delimited CDX line into a dict with the keys "url",
    "datetime", "offset" and "path".

    Field 1 is the timestamp (reformatted by parse_datetime), field 2 the
    URL, field 8 the integer offset and field 9 the file name, which is
    resolved relative to the data/ directory.
    """
    # NOTE(review): indices presumably follow the 10-field
    # "N b a m s k r M V g" CDX layout - confirm against the index file.
    fields = cdx_line.strip().split(' ')
    return {
        "url": fields[2],
        "datetime": parse_datetime(fields[1]),
        "offset": int(fields[8]),
        "path": f"data/{fields[9]}"
    }
def get_record(cdx):
    """
    Fetch the WARC record a CDX entry points at: open cdx["path"], seek to
    cdx["offset"] and return the first record read from that position.
    """
    # NOTE(review): the handle is intentionally not closed here; presumably
    # the returned record still reads its payload from it - confirm before
    # adding a close.
    stream = open(cdx["path"], "rb")
    stream.seek(cdx["offset"])
    return next(ArchiveIterator(stream))
def parse_datetime(s):
    """
    Convert a 14-digit YYYYMMDDhhmmss timestamp into the form
    YYYY-MM-DDThh:mm:ssZ. Raises ValueError if s does not match.
    """
    parsed = datetime.datetime.strptime(s, "%Y%m%d%H%M%S")
    # Formatting a datetime with a non-empty spec delegates to strftime().
    return f"{parsed:%Y-%m-%dT%H:%M:%SZ}"
def write_record(rec, path):
    """
    Append a single WARC record, uncompressed, to the file at path.

    The file is opened in binary append mode, so it is created if missing
    and existing content is kept.
    """
    # The with-block closes the file even if writing the record raises.
    with open(path, "ab") as fh:
        writer = WARCWriter(fh, gzip=False)
        writer.write_record(rec)
# Run the rewrite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()