diff --git a/ipwb/indexer.py b/ipwb/indexer.py index 66d7cd94..ecb904be 100755 --- a/ipwb/indexer.py +++ b/ipwb/indexer.py @@ -128,13 +128,13 @@ def index_file_at(warc_paths, encryption_key=None, warc_paths_to_remove = [] for warc_path in warc_paths: if is_wacz(warc_path): - warc_paths_to_append.append( - extract_warcs_from_wacz(warc_path)) + warc_paths_to_append += extract_warcs_from_wacz(warc_path) warc_paths_to_remove.append(warc_path) # Manipulate list of WARCs extracted from WACZ - warc_paths.remove(warc_paths_to_remove) - warc_paths += warc_paths_to_append + for ptr in warc_paths_to_remove: + warc_paths.remove(ptr) + warc_paths = warc_paths + warc_paths_to_append cdxj_lines = [] diff --git a/ipwb/util.py b/ipwb/util.py index b52cad9a..98c0f7bf 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -354,14 +354,14 @@ def get_warc_paths_in_wacz(wacz_path): return [w for w in z.namelist() if w.startswith('archive/')] -def extract_warcs_to_disk(warc_paths): +def extract_warcs_to_disk(wacz_path, warc_paths): for warc in warc_paths: - with ZipFile(warc) as z: + with ZipFile(wacz_path) as z: z.extract(warc) def extract_warcs_from_wacz(wacz_path): warc_paths = get_warc_paths_in_wacz(wacz_path) - extract_warcs_to_disk(warc_paths) + extract_warcs_to_disk(wacz_path, warc_paths) return glob.glob('archive/*')