Skip to content

Commit

Permalink
feat: dataset preparations
Browse files Browse the repository at this point in the history
  • Loading branch information
AshishKumar4 committed Aug 3, 2024
1 parent 685e844 commit d1274be
Show file tree
Hide file tree
Showing 6 changed files with 1,638 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ gcs_mount
datacache
*.deb
gcsfuse.yml
*.csv
*.tsv
4 changes: 2 additions & 2 deletions datasets/cc12m downloader.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

img2dataset --url_list ./datacache/cc12m.csv --input_format "csv"\
img2dataset --url_list ./cc12m.tsv --input_format "tsv"\
--url_col "image_url" --caption_col "caption" --output_format arrayrecord\
--output_folder gs://flaxdiff-datasets-regional/arrayrecord/cc12m --processes_count 64
--output_folder gs://flaxdiff-datasets-regional/arrayrecord/cc12m --processes_count 64\
--thread_count 64 --image_size 256\
--enable_wandb True --disallowed_header_directives '[]' --compute_hash None --max_shard_retry 3 --timeout 60
1,607 changes: 1,607 additions & 0 deletions datasets/dataset preparations.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions datasets/laion_a+coco17.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
#
# Runs img2dataset over the URL list in
# $HOME/research/laion-aesthetics-12m+mscoco-2017.parquet (columns "url" and
# "caption") and writes the output, in "arrayrecord" format, to the
# flaxdiff-datasets-regional GCS bucket.
#
# Requirements: img2dataset on PATH, $HOME set, and the parquet file present.
# NOTE(review): "arrayrecord" is presumably provided by a patched img2dataset
# build rather than the upstream release — confirm before running elsewhere.

set -euo pipefail

readonly URL_LIST="${HOME}/research/laion-aesthetics-12m+mscoco-2017.parquet"
readonly OUTPUT_FOLDER="gs://flaxdiff-datasets-regional/arrayrecord/laion-aesthetics-12m+mscoco-2017"

# One flag per line, each with a space before the trailing backslash: the shell
# deletes a backslash-newline pair outright, so without that space (or leading
# indentation on the next line) the following flag is glued onto the previous
# argument, e.g. `"parquet"--url_col`.
img2dataset \
  --url_list "${URL_LIST}" \
  --input_format "parquet" \
  --url_col "url" \
  --caption_col "caption" \
  --output_format arrayrecord \
  --output_folder "${OUTPUT_FOLDER}" \
  --processes_count 64 \
  --thread_count 64 \
  --image_size 256 \
  --min_image_size 100 \
  --enable_wandb True \
  --disallowed_header_directives '[]' \
  --compute_hash None \
  --max_shard_retry 3 \
  --timeout 60
File renamed without changes.
39 changes: 20 additions & 19 deletions tpu_utils/setup_tpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,6 @@ sudo apt update
sudo apt install -y knot-resolver
sudo sh -c 'echo `hostname -I` `hostname` >> /etc/hosts'
sudo sh -c 'echo nameserver 127.0.0.1 > /etc/resolv.conf'
sudo systemctl stop systemd-resolved
sudo systemctl start kresd@1.service
sudo systemctl start kresd@2.service
sudo systemctl start kresd@3.service
sudo systemctl start kresd@4.service
sudo systemctl start kresd@5.service
sudo systemctl start kresd@6.service
sudo systemctl start kresd@7.service
sudo systemctl start kresd@8.service
sudo systemctl start kresd@9.service
sudo systemctl start kresd@10.service
sudo systemctl start kresd@11.service
sudo systemctl start kresd@12.service
sudo systemctl start kresd@13.service
sudo systemctl start kresd@14.service
sudo systemctl start kresd@15.service
sudo systemctl start kresd@16.service

# Backup the original resolv.conf
sudo cp /etc/resolv.conf /etc/resolv.conf.bak
Expand All @@ -56,6 +39,24 @@ for ns in "${nameservers[@]}"; do
done
echo "Nameservers added to /etc/resolv.conf"

sudo systemctl stop systemd-resolved
sudo systemctl start kresd@1.service
sudo systemctl start kresd@2.service
sudo systemctl start kresd@3.service
sudo systemctl start kresd@4.service
sudo systemctl start kresd@5.service
sudo systemctl start kresd@6.service
sudo systemctl start kresd@7.service
sudo systemctl start kresd@8.service
sudo systemctl start kresd@9.service
sudo systemctl start kresd@10.service
sudo systemctl start kresd@11.service
sudo systemctl start kresd@12.service
sudo systemctl start kresd@13.service
sudo systemctl start kresd@14.service
sudo systemctl start kresd@15.service
sudo systemctl start kresd@16.service

# Installing and setting up gcsfuse
export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s`
echo "deb [signed-by=/usr/share/keyrings/cloud.google.asc] https://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
Expand Down Expand Up @@ -134,9 +135,9 @@ if [ -n "$GCS_BUCKET" ]; then

# Make the script executable
chmod +x $LOCAL_FILE

echo "Mounting GCS bucket: $GCS_BUCKET to $HOME/gcs_mount"
# Run the script with the specified arguments
./$LOCAL_FILE DATASET_GCS_BUCKET=$GCS_BUCKET MOUNT_PATH=/mnt/gcs_mount
./$LOCAL_FILE DATASET_GCS_BUCKET=$GCS_BUCKET MOUNT_PATH=$HOME/gcs_mount
fi

if [ "$DEV_MODE" = true ]; then
Expand Down

0 comments on commit d1274be

Please sign in to comment.