Skip to content

Commit

Permalink
File list of warc appending for #710
Browse files Browse the repository at this point in the history
  • Loading branch information
machawk1 committed May 17, 2022
1 parent 133a9e4 commit 9436999
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
8 changes: 4 additions & 4 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,13 @@ def index_file_at(warc_paths, encryption_key=None,
warc_paths_to_remove = []
for warc_path in warc_paths:
if is_wacz(warc_path):
warc_paths_to_append.append(
extract_warcs_from_wacz(warc_path))
warc_paths_to_append += extract_warcs_from_wacz(warc_path)
warc_paths_to_remove.append(warc_path)

# Manipulate list of WARCs extracted from WACZ
warc_paths.remove(warc_paths_to_remove)
warc_paths += warc_paths_to_append
for ptr in warc_paths_to_remove:
warc_paths.remove(ptr)
warc_paths = warc_paths + warc_paths_to_append

cdxj_lines = []

Expand Down
6 changes: 3 additions & 3 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,14 @@ def get_warc_paths_in_wacz(wacz_path):
return [w for w in z.namelist() if w.startswith('archive/')]


def extract_warcs_to_disk(wacz_path, warc_paths):
    """Extract the named members of a WACZ archive into the working directory.

    Args:
        wacz_path: Path of the WACZ (ZIP) file to read from.
        warc_paths: Member names inside the archive to extract, spelled
            exactly as they appear in the archive's name list.
    """
    # Nothing requested: leave the archive untouched, as the per-member
    # loop did when given an empty list.
    if not warc_paths:
        return

    # Open the archive once and pull every member from the same handle
    # instead of re-opening the ZIP for each member.
    with ZipFile(wacz_path) as z:
        for warc in warc_paths:
            z.extract(warc)


def extract_warcs_from_wacz(wacz_path):
    """Extract the WARCs bundled in a WACZ file and return their paths.

    The members are written beneath the current working directory by
    extract_warcs_to_disk.

    Args:
        wacz_path: Path of the WACZ file to unpack.

    Returns:
        A list of the relative paths of the file members extracted from
        this archive.
    """
    warc_paths = get_warc_paths_in_wacz(wacz_path)
    extract_warcs_to_disk(wacz_path, warc_paths)

    # Report only what this archive contributed. Globbing the shared
    # 'archive/' directory would also pick up files left there by a
    # previously processed WACZ, so callers accumulating results across
    # several WACZ files would see the same WARC more than once.
    # Names ending in '/' are ZIP directory entries, not WARC files.
    return [path for path in warc_paths if not path.endswith('/')]

0 comments on commit 9436999

Please sign in to comment.