diff --git a/download_dataset.py b/download_dataset.py
index fec082b..a744d2c 100644
--- a/download_dataset.py
+++ b/download_dataset.py
@@ -4,9 +4,7 @@
 from tqdm import tqdm
 
 subdir = 'data'
-if not os.path.exists(subdir):
-    os.makedirs(subdir)
-subdir = subdir.replace('\\','/') # needed for Windows
+os.makedirs(subdir, exist_ok=True)
 
 for ds in [
     'webtext',
@@ -18,12 +16,19 @@
     for split in ['train', 'valid', 'test']:
         filename = ds + "." + split + '.jsonl'
         r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
+        r.raise_for_status()
+        file_size = int(r.headers["content-length"])
+        filepath = os.path.join(subdir, filename)
+        try:
+            if os.stat(filepath).st_size == file_size:
+                print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
+                r.close()
+                continue
+        except OSError: # likely "file not found" or similar
+            pass
 
-        with open(os.path.join(subdir, filename), 'wb') as f:
-            file_size = int(r.headers["content-length"])
-            chunk_size = 1000
+        with open(filepath, 'wb') as f:
             with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
-                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
-                for chunk in r.iter_content(chunk_size=chunk_size):
+                for chunk in r.iter_content(chunk_size=4194304):
                     f.write(chunk)
-                    pbar.update(chunk_size)
+                    pbar.update(len(chunk))
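
For reference, a minimal runnable sketch of download_dataset.py as it reads after this patch. Only the tqdm import is visible in the hunk context, so the os and requests imports are assumptions, and the dataset list is left truncated where the diff elides it.

    import os
    import requests
    from tqdm import tqdm

    subdir = 'data'
    # Replaces the old exists()/makedirs() pair and the Windows path fixup.
    os.makedirs(subdir, exist_ok=True)

    for ds in [
        'webtext',
        # ... remaining dataset names elided by the diff ...
    ]:
        for split in ['train', 'valid', 'test']:
            filename = ds + "." + split + '.jsonl'
            r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
            r.raise_for_status()  # fail loudly on HTTP errors instead of writing an error page to disk
            file_size = int(r.headers["content-length"])
            filepath = os.path.join(subdir, filename)
            try:
                # Skip files that already exist on disk at the expected size.
                if os.stat(filepath).st_size == file_size:
                    print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
                    r.close()
                    continue
            except OSError: # likely "file not found" or similar
                pass

            with open(filepath, 'wb') as f:
                with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                    # Stream in 4 MiB chunks; advance the bar by the bytes
                    # actually received, since the final chunk may be short.
                    for chunk in r.iter_content(chunk_size=4194304):
                        f.write(chunk)
                        pbar.update(len(chunk))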