# Utilities for batching local files into tar.gz archives, pushing them to a
# Hugging Face dataset repo, and downloading/extracting them back to a local directory.
import os
import shutil
import tarfile
import tempfile
import time

from huggingface_hub import hf_hub_download, HfApi, login
from tqdm import tqdm

def delete_cache_files(local_directory, repo_name):
    """Remove the .locks directory and the hub cache tree that hf_hub_download
    leaves behind when cache_dir points at local_directory."""
    lock_dir = os.path.join(local_directory, ".locks")
    dataset_cache = os.path.join(local_directory, f"datasets--{repo_name.replace('/', '--')}")
    for path in (lock_dir, dataset_cache):
        if os.path.exists(path):
            try:
                shutil.rmtree(path)
                print(f"Deleted {path}")
            except OSError:
                print(f"Could not delete {path}")

def tar_batch_and_push_to_huggingface(local_directory, repo_name, batch_size=25000, type_='images', skip=-1):
    """Tar the files in local_directory in batches and upload each archive to a
    Hugging Face dataset repo. Assumes the process is already authenticated."""
    api = HfApi()

    # Collect the files (or subfolders) to archive
    if type_ == 'images':
        image_files = [f for f in os.listdir(local_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    elif type_ == 'numpy':
        image_files = [f for f in os.listdir(local_directory) if f.lower().endswith('.npy')]
    elif type_ == 'folder':
        # One archive per subfolder, so each "batch" is a single directory
        batch_size = 1
        image_files = [f for f in os.listdir(local_directory) if os.path.isdir(os.path.join(local_directory, f))]
    else:
        raise ValueError(f"Unsupported type_: {type_!r}")

    # Process and upload in batches; `skip` lets a failed run resume partway through
    for i in range(0, len(image_files), batch_size):
        batch = image_files[i:i + batch_size]
        batch_number = i // batch_size + 1
        if batch_number <= skip:
            continue
        with tempfile.TemporaryDirectory() as temp_dir:
            if type_ == 'folder':
                # Archive every .npy file inside this subfolder
                tar_filename = f"{batch[0]}.tar.gz"
                tar_path = os.path.join(temp_dir, tar_filename)
                print(f"Loading embedding to {tar_path}")
                numpy_files = [f for f in os.listdir(os.path.join(local_directory, batch[0])) if f.lower().endswith('.npy')]
                with tarfile.open(tar_path, "w:gz") as tar:
                    for file in tqdm(numpy_files, desc=f"Adding files to tar (batch {batch_number})"):
                        file_path = os.path.join(local_directory, batch[0], file)
                        tar.add(file_path, arcname=file)
            else:
                # Archive this batch of flat files
                tar_filename = f"batch_{batch_number}.tar.gz"
                tar_path = os.path.join(temp_dir, tar_filename)
                print(f"Creating tar file for batch {batch_number}: {tar_path}")
                with tarfile.open(tar_path, "w:gz") as tar:
                    for file in tqdm(batch, desc=f"Adding files to tar (batch {batch_number})"):
                        file_path = os.path.join(local_directory, file)
                        tar.add(file_path, arcname=file)

            print(f"Uploading batch {batch_number} to Hugging Face...")
            api.upload_file(
                path_or_fileobj=tar_path,
                path_in_repo=tar_filename,
                repo_id=repo_name,
                repo_type="dataset",
            )
            print(f"Batch {batch_number} upload complete. Tar file available at: "
                  f"https://huggingface.co/datasets/{repo_name}/resolve/main/{tar_filename}")
    print("All batches have been uploaded to Hugging Face.")
def download_and_extract_batches(repo_name, local_directory, folder_name="images", keep_folder=False, ending='.tar.gz'):
    """Download every tar archive (and parquet file) from a Hugging Face dataset
    repo, extracting the archives under local_directory/folder_name."""
    start = time.time()
    api = HfApi()
    end = time.time()
    print(f"Initialized Hugging Face API in {end - start:.2f} seconds.")

    # Create the local directories if they don't exist
    os.makedirs(local_directory, exist_ok=True)
    images_directory = os.path.join(local_directory, folder_name)
    os.makedirs(images_directory, exist_ok=True)
    print(images_directory)

    # List the repository contents and pick out the archives and parquet files
    files = api.list_repo_files(repo_id=repo_name, repo_type="dataset")
    print(f"Found {len(files)} files in the repository.")
    print(files)
    tar_files = [f for f in files if f.endswith(ending)]
    parquet_files = [f for f in files if f.endswith('.parquet')]

    # hf_hub_download always writes into a cache tree rather than directly into
    # local_directory, so copy the parquet file out of the cache afterwards
    for parquet_file in tqdm(parquet_files, desc="Downloading parquet files"):
        parquet_path = hf_hub_download(
            repo_id=repo_name,
            filename=parquet_file,
            repo_type="dataset",
            cache_dir=local_directory
        )
        print(f"Downloaded {parquet_file} to {local_directory}")
        # Note: every parquet file is copied over the same captions.parquet,
        # so only the last one is kept if the repo holds several
        shutil.copy(parquet_path, os.path.join(local_directory, 'captions.parquet'))
        print(f"Copied {parquet_file} to {local_directory}")

    for tar_file in tqdm(tar_files, desc="Downloading and extracting batches"):
        # Download each archive into a temporary cache, then extract it
        with tempfile.TemporaryDirectory() as temp_dir:
            tar_path = hf_hub_download(
                repo_id=repo_name,
                filename=tar_file,
                repo_type="dataset",
                cache_dir=temp_dir
            )
            if keep_folder:
                # Extract each archive into its own subfolder named after the file
                batch_folder = tar_file.split('.')[0]
                images_folder = os.path.join(images_directory, batch_folder)
                os.makedirs(images_folder, exist_ok=True)
                with tarfile.open(tar_path, 'r:gz') as tar:
                    tar.extractall(path=images_folder)
            else:
                with tarfile.open(tar_path, 'r:gz') as tar:
                    tar.extractall(path=images_directory)
            print(f"Extracted {tar_file} to {images_directory}")

    print("All batches have been downloaded and extracted.")
    delete_cache_files(local_directory, repo_name)
# Note: hf_hub_download fetches one file at a time through the cache; very
# large repos may need another solution for downloading from Hugging Face.
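# One possible alternative (a sketch, not the script's tested path):
# huggingface_hub.snapshot_download fetches all matching files in a single,
# resumable call. The "*.tar.gz" pattern is an assumption matching the archive
# naming this script produces.
def download_batches_snapshot(repo_name, local_directory):
    from huggingface_hub import snapshot_download
    # Download only the tar.gz archives straight into local_directory
    snapshot_download(
        repo_id=repo_name,
        repo_type="dataset",
        local_dir=local_directory,
        allow_patterns=["*.tar.gz"],
    )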
# Usage example
if __name__ == '__main__':
    repo_name = "hung20gg/your-dataset-name"
    local_directory = "../data/sample"
    download_and_extract_batches(repo_name, local_directory)
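
    # Sketch of the upload direction (the local path below is an assumption,
    # not part of the original script); uncomment to batch and push images:
    # tar_batch_and_push_to_huggingface(
    #     local_directory="../data/sample/images",
    #     repo_name=repo_name,
    #     type_='images',
    # )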