diff --git a/docker/problem.yaml b/docker/quokka_deploy.yaml
similarity index 71%
rename from docker/problem.yaml
rename to docker/quokka_deploy.yaml
index 999706f..a91aa69 100644
--- a/docker/problem.yaml
+++ b/docker/quokka_deploy.yaml
@@ -1,5 +1,5 @@
 # An unique identifier for the head node and workers of this cluster.
-cluster_name: us_west_2_3.8_full_regional_setups
+cluster_name: quokka_cluster

 # The maximum number of workers nodes to launch in addition to the head
 # node.
@@ -17,6 +17,7 @@ upscaling_speed: 4.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
+    #image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
     # image: marsupialtail/quokka-on-docker:1.0 # use this one if you don't need ML dependencies, it's faster to pull
     image: rayproject/ray:nightly-py38-cpu # use this one if you don't need ML dependencies, it's faster to pull
     container_name: "ray_container"
@@ -26,6 +27,10 @@ docker:
     run_options:  # Extra options to pass into "docker run"
         - --ulimit nofile=65536:65536
         - -p 5005:5005
+        - -e AWS_ACCESS_KEY_ID=X
+        - -e AWS_SECRET_ACCESS_KEY=X
+        - -e AWS_REGION=us-west-2
+        - -e GLIBC_TUNABLES=glibc.malloc.trim_threshold=524288

     # Example of running a GPU head with CPU workers
     # head_image: "rayproject/ray-ml:latest-gpu"
@@ -53,7 +58,7 @@ provider:
 # How Ray will authenticate with newly launched nodes.
 auth:
     ssh_user: ubuntu
-    ssh_private_key: redact.pem
+    ssh_private_key: X

 # Tell the autoscaler the allowed node types and the resources they provide.
 # The key is the name of the node type, which is just for debugging purposes.
@@ -78,7 +83,7 @@ available_node_types:
             # for default images for other zones.
             ImageId: ami-0530ca8899fac469f
             # Additional options in the boto docs.
-            KeyName: redact
+            KeyName: X
             SecurityGroupIds: [sg-0f01d7c338d22dfa5]
             BlockDeviceMappings:
                 - DeviceName: /dev/sda1
@@ -110,7 +115,7 @@ available_node_types:
             # NOTE: If relying on spot instances, it is best to specify multiple different instance
             # types to avoid interruption when one instance type is experiencing heightened demand.
             # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/
-            KeyName: redact
+            KeyName: X
             SecurityGroupIds: [sg-0f01d7c338d22dfa5]
             BlockDeviceMappings:
                 - DeviceName: /dev/sda1
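
The setup_commands hunk below pins ray==2.3.0, which must match the Ray version on the client machine that drives the cluster; Ray generally refuses mismatched client/server versions. A minimal client-side sanity check, sketched here (EXPECTED_RAY_VERSION is illustrative, not part of this config):

import ray

EXPECTED_RAY_VERSION = "2.3.0"  # keep in sync with `pip3 install ray==2.3.0` in setup_commands
assert ray.__version__ == EXPECTED_RAY_VERSION, (
    f"client runs ray {ray.__version__}, but the cluster pins {EXPECTED_RAY_VERSION}"
)
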
@@ -126,7 +131,25 @@ initialization_commands:
     - sudo systemctl restart docker -f


-setup_commands: []
+setup_commands:
+    - sudo apt-get update
+    - sudo apt-get -y install curl
+    - sudo apt-get -y install unzip
+    - sudo apt-get -y install python3.8-dev
+    - pip3 install ray==2.3.0 # this must be the SAME version as the Ray on your client machine, i.e. the one you will use to interact with the cluster.
+    - pip3 install ldbpy # needed for the few Quokka operators that require custom accelerated compute, mostly for time series; most operators are pure Python.
+    - pip3 install threadpoolctl # install optional dependencies for your workloads
+    - pip3 install pyquokka # installs the latest PyPI release of Quokka to quickly pull in its dependencies; the Quokka code that actually runs is whatever is on your client, pushed at runtime.
+    - pip3 install cffi
+    - sudo apt-get -y install nvme-cli
+    - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+    - unzip awscliv2.zip
+    - sudo ./aws/install
+    - pip3 install --upgrade awscli
+    - sudo apt-get -y install python-cffi
+    - sudo mkdir /data
+    - sudo chmod -R a+rw /data
+    - nohup python3 -u $(python3 -c 'import pyquokka; print(pyquokka.__file__.replace("__init__.py", "flight.py"))') > foo.out 2> foo.err < /dev/null & # start the flight server inside the Docker container
     #- /home/ubuntu/.local/bin/ray stop
     # Note: if you're developing Ray, you probably want to create a Docker image that
     # has your Ray repo pre-cloned. Then, you can replace the pip installs
@@ -136,7 +159,13 @@ setup_commands: []
 # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"

 # Custom commands that will be run on the head node after common setup.
-head_setup_commands: []
+head_setup_commands:
+    - curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg
+    - echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb focal main" | sudo tee /etc/apt/sources.list.d/redis.list
+    - sudo apt-get update
+    - sudo apt-get install -y redis
+    - redis-server $(python3 -c 'import pyquokka; print(pyquokka.__file__.replace("__init__.py", "redis.conf"))') --port 6800 --protected-mode no &
+    - echo -e '* hard nofile 65536\n* soft nofile 65536\n* hard core unlimited\n* soft core unlimited' | sudo tee /etc/security/limits.conf

 # Custom commands that will be run on worker nodes after common setup.
 worker_setup_commands: []
@@ -145,7 +174,7 @@ worker_setup_commands: []
 head_start_ray_commands:
     - ray stop
 #    - --head --port=6380 --autoscaling-config=~/ray_bootstrap_config.yaml
-    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory 5000000000 --system-config='{"automatic_object_spilling_enabled":true,"max_io_workers":4,"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":\"/data\"}}"}'

 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
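
The doubly-escaped value passed to --system-config in the new head_start_ray_commands line is easy to mangle by hand: object_spilling_config is itself a JSON string embedded in JSON, which is why its inner quotes are backslash-escaped in the YAML. A sketch of how that flag value can be generated, using only the Python standard library and the key names taken straight from the line above:

import json

# Spill objects to the /data directory created in setup_commands.
spilling = {"type": "filesystem", "params": {"directory_path": "/data"}}
system_config = {
    "automatic_object_spilling_enabled": True,
    "max_io_workers": 4,
    "object_spilling_config": json.dumps(spilling),  # inner JSON string, hence the \" escapes in the YAML
}
print(json.dumps(system_config))  # paste the output after --system-config='...'
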
diff --git a/docker/us_west_2_3.8.yaml b/docker/us_west_2_3.8.yaml
deleted file mode 100644
index 7e852be..0000000
--- a/docker/us_west_2_3.8.yaml
+++ /dev/null
@@ -1,159 +0,0 @@
-# An unique identifier for the head node and workers of this cluster.
-cluster_name: us_west_2_3.8_full_regional_setups
-
-# The maximum number of workers nodes to launch in addition to the head
-# node.
-# Note: This number needs to be larger than the total number of min_workers set
-max_workers: 10
-
-# The autoscaler will scale up the cluster faster with higher upscaling speed.
-# E.g., if the task requires adding more nodes then autoscaler will gradually
-# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
-# This number should be > 0.
-upscaling_speed: 4.0
-
-
-# This executes all commands on all nodes in the docker container,
-# and opens all the necessary ports to support the Ray cluster.
-# Empty string means disabled.
-docker:
-    #image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
-    image: marsupialtail/quokka-on-docker:1.0 # use this one if you don't need ML dependencies, it's faster to pull
-    container_name: "ray_container"
-    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
-    # if no cached version is present.
-    pull_before_run: True
-    run_options:  # Extra options to pass into "docker run"
-        - --ulimit nofile=65536:65536
-        - -p 5005:5005
-
-    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray-ml:latest-gpu"
-    # Allow Ray to automatically detect GPUs
-
-    # worker_image: "rayproject/ray-ml:latest-cpu"
-    # worker_run_options: []
-
-
-# If a node is idle for this many minutes, it will be removed.
-idle_timeout_minutes: 5
-
-# Cloud-provider specific configuration.
-provider:
-    type: aws
-    region: us-west-2
-    # Availability zone(s), comma-separated, that nodes may be launched in.
-    # Nodes will be launched in the first listed availability zone and will
-    # be tried in the subsequent availability zones if launching fails.
-    availability_zone: us-west-2a,us-west-2b
-    # security_group:
-    #     GroupName: sg-0f01d7c338d22dfa5
-
-
-# How Ray will authenticate with newly launched nodes.
-auth:
-    ssh_user: ubuntu
-    ssh_private_key: /home/ziheng/Downloads/oregon-neurodb.pem
-
-# Tell the autoscaler the allowed node types and the resources they provide.
-# The key is the name of the node type, which is just for debugging purposes.
-# The node config specifies the launch config and physical instance type.
-available_node_types:
-    ray.head.default:
-        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
-        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
-        # You can also set custom resources.
-        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
-        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
-        resources: {}
-        min_workers: 0
-        max_workers: 5
-        # Provider-specific config for this node type, e.g. instance type. By default
-        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-        # For more documentation on available fields, see:
-        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-        node_config:
-            InstanceType: i3.2xlarge
-            # Default AMI for us-west-2.
-            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
-            # for default images for other zones.
-            ImageId: ami-0530ca8899fac469f
-            # Additional options in the boto docs.
-            KeyName: oregon-neurodb
-            SecurityGroupIds: [sg-0f01d7c338d22dfa5]
-    ray.worker.default:
-        # The minimum number of worker nodes of this type to launch.
-        # This number should be >= 0.
-        min_workers: 2
-        # The maximum number of worker nodes of this type to launch.
-        # This takes precedence over min_workers.
-        max_workers: 2
-        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
-        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
-        # You can also set custom resources.
-        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
-        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
-        resources: {}
-        # Provider-specific config for this node type, e.g. instance type. By default
-        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
-        # For more documentation on available fields, see:
-        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
-        node_config:
-            InstanceType: i3.2xlarge
-            # Default AMI for us-west-2.
-            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
-            # for default images for other zones.
-            ImageId: ami-0530ca8899fac469f
-            # Run workers on spot by default. Comment this out to use on-demand.
-            # NOTE: If relying on spot instances, it is best to specify multiple different instance
-            # types to avoid interruption when one instance type is experiencing heightened demand.
-            # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/
-            KeyName: oregon-neurodb
-            SecurityGroupIds: [sg-0f01d7c338d22dfa5]
-
-head_node_type: ray.head.default
-
-initialization_commands:
-    - curl -fsSL https://get.docker.com -o get-docker.sh
-    - sudo sh get-docker.sh
-    - sudo usermod -aG docker $USER
-    - sudo systemctl restart docker -f
-
-
-setup_commands:
-    - sudo apt-get update
-    - sudo apt install -y python3-pip
-    - pip3 install ray==2.0.0
-    - pip3 install boto3
-    - pip3 install parallel-ssh
-    #- /home/ubuntu/.local/bin/ray stop
-    # Note: if you're developing Ray, you probably want to create a Docker image that
-    # has your Ray repo pre-cloned. Then, you can replace the pip installs
-    # below with a git checkout (and possibly a recompile).
-    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
-    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
-    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
-
-# Custom commands that will be run on the head node after common setup.
-head_setup_commands:
-    - sudo apt-get update
-    - sudo apt install -y python3-pip
-    - pip3 install ray
-
-# Custom commands that will be run on worker nodes after common setup.
-worker_setup_commands:
-    - sudo apt-get update
-    - sudo apt install -y python3-pip
-    - pip3 install ray
-
-# Command to start ray on the head node. You don't need to change this.
-head_start_ray_commands:
-    - ray stop
-#    - --head --port=6380 --autoscaling-config=~/ray_bootstrap_config.yaml
-    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
-
-# Command to start ray on worker nodes. You don't need to change this.
-worker_start_ray_commands:
-    - ray stop
-    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
-    #- ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
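
Both the flight-server command in setup_commands and the redis-server command in head_setup_commands locate files that ship inside the installed pyquokka package by rewriting pyquokka.__file__. A sketch of the same resolution, useful for verifying a node's install before debugging either service:

import os
import pyquokka

# pyquokka.__file__ points at .../site-packages/pyquokka/__init__.py, so the
# package directory also holds flight.py and redis.conf.
pkg_dir = os.path.dirname(pyquokka.__file__)
for name in ("flight.py", "redis.conf"):
    path = os.path.join(pkg_dir, name)
    assert os.path.exists(path), f"{name} missing from the pyquokka install"
    print(path)

After bringing the cluster up with `ray up docker/quokka_deploy.yaml`, the flight server's stdout and stderr land in foo.out and foo.err in the container's working directory, per the nohup redirections in setup_commands.
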