Skip to content

Commit

Permalink
Don't clobber existing files
Browse files: browse the repository at this point in the history
  • Loading branch information
akx committed Feb 3, 2023
1 parent 4452365 commit b4285b9
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions download_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,9 +17,17 @@
filename = ds + "." + split + '.jsonl'
r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
r.raise_for_status()
file_size = int(r.headers["content-length"])
filepath = os.path.join(subdir, filename)
try:
if os.stat(filepath).st_size == file_size:
print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
r.close()
continue
except OSError: # likely "file not found" or similar
pass

with open(os.path.join(subdir, filename), 'wb') as f:
file_size = int(r.headers["content-length"])
with open(filepath, 'wb') as f:
with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
for chunk in r.iter_content(chunk_size=4194304):
f.write(chunk)
Expand Down

0 comments on commit b4285b9

Please sign in to comment.