diff --git a/gcsfuse.sh b/gcsfuse.sh
new file mode 100755
index 0000000..de6235f
--- /dev/null
+++ b/gcsfuse.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Description:
+# Mounts a GCS bucket read-only at a local directory with gcsfuse.
+# Usage:
+# bash gcsfuse.sh DATASET_GCS_BUCKET=maxtext-dataset MOUNT_PATH=dataset
+
+set -e
+
+# Export each KEY=VALUE argument as an environment variable and echo it.
+for ARGUMENT in "$@"; do
+  IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
+  export "$KEY"="$VALUE"
+  echo "$KEY"="$VALUE"
+done
+
+if [[ -z ${DATASET_GCS_BUCKET} || -z ${MOUNT_PATH} ]]; then
+  echo "Please set arguments: DATASET_GCS_BUCKET and MOUNT_PATH"
+  exit 1
+fi
+
+# Strip a leading gs:// scheme so only the bucket name is passed to gcsfuse.
+if [[ "$DATASET_GCS_BUCKET" == gs://* ]]; then
+  DATASET_GCS_BUCKET="${DATASET_GCS_BUCKET#gs://}"
+  echo "Removed gs:// from GCS bucket name, GCS bucket is $DATASET_GCS_BUCKET"
+fi
+
+if [[ -d ${MOUNT_PATH} ]]; then
+  echo "$MOUNT_PATH exists, removing..."
+  # Try to unmount first; if that fails, remove the directory instead.
+  fusermount -u "$MOUNT_PATH" || rm -rf -- "$MOUNT_PATH"
+fi
+
+mkdir -p -- "$MOUNT_PATH"
+
+# see https://cloud.google.com/storage/docs/gcsfuse-cli for all configurable options of gcsfuse CLI
+# Grain uses _PROCESS_MANAGEMENT_MAX_THREADS = 64 (https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)
+# Please make sure max-conns-per-host > grain_worker_count * _PROCESS_MANAGEMENT_MAX_THREADS
+# NOTE(review): max-conns-per-host=0 is presumably "no limit" per the gcsfuse CLI docs - confirm for the installed version.
+
+gcsfuse -o ro --implicit-dirs --http-client-timeout=5s --max-conns-per-host=0 --max-idle-conns-per-host=10000 \
+  --experimental-enable-json-read --kernel-list-cache-ttl-secs=-1 --config-file="$HOME/gcsfuse.yml" \
+  --log-file="$HOME/gcsfuse.json" "$DATASET_GCS_BUCKET" "$MOUNT_PATH"
diff --git a/setup_tpu.sh b/setup_tpu.sh
index 3c6896f..3afc4ca 100755
--- a/setup_tpu.sh
+++ b/setup_tpu.sh
@@ -61,7 +61,7 @@ export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s`
 echo "deb [signed-by=/usr/share/keyrings/cloud.google.asc] https://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
 curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo tee /usr/share/keyrings/cloud.google.asc
 sudo apt update
-sudo apt install gcsfuse libgl1
+sudo apt install -y gcsfuse libgl1
 
 # Define the file name
 gcsfuse_conf="$HOME/gcsfuse.yml"