From b4e6e90f1d775059f4097857e65a6a3cb256b124 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 8 Sep 2016 11:03:40 -0700 Subject: [PATCH] Add FFmpeg GPU recipe - Fix NV-series provisioning - Fix up various READMEs - Add maintained by tags in Dockerfiles - Add missing config flag in jobs json - Fix non-Docker shipyard azure-storage req --- CHANGELOG.md | 2 +- Dockerfile | 1 + README.md | 8 +-- config_templates/jobs.json | 1 + docs/00-introduction.md | 17 +++--- docs/02-batch-shipyard-configuration.md | 10 ++-- recipes/CNTK-CPU-OpenMPI/docker/Dockerfile | 1 + recipes/CNTK-GPU-OpenMPI/README.md | 7 +-- recipes/CNTK-GPU-OpenMPI/docker/Dockerfile | 1 + recipes/Caffe-GPU/README.md | 7 +-- recipes/Caffe-GPU/docker/Dockerfile | 1 + recipes/FFmpeg-GPU/README.md | 53 +++++++++++++++++++ .../docker/Dockerfile | 1 + recipes/NAMD-TCP/docker/Dockerfile | 1 + recipes/README.md | 10 ++-- recipes/TensorFlow-GPU/README.md | 7 +-- scripts/shipyard_nodeprep.sh | 26 +++++---- 17 files changed, 111 insertions(+), 43 deletions(-) create mode 100644 recipes/FFmpeg-GPU/README.md diff --git a/CHANGELOG.md b/CHANGELOG.md index da314bef..2986a27c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ### Added - Transparent GPU support for Azure N-Series VMs - New recipes added: Caffe-GPU, CNTK-CPU-OpenMPI, CNTK-GPU-OpenMPI, -NAMD-Infiniband-IntelMPI, NAMD-TCP, TensorFlow-GPU +FFmpeg-GPU, NAMD-Infiniband-IntelMPI, NAMD-TCP, TensorFlow-GPU ### Changed - Multi-instance tasks now automatically complete their job by default. This diff --git a/Dockerfile b/Dockerfile index 1c13f402..37e39bf2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for Azure/batch-shipyard FROM gliderlabs/alpine:3.4 +MAINTAINER Fred Park # set environment variables # currently libtorrent-rasterbar 1.1.0+ DHT implementations are broken diff --git a/README.md b/README.md index 0eae1e98..fa8fd28e 100644 --- a/README.md +++ b/README.md @@ -61,12 +61,8 @@ file via the command `pip install --user -r requirements.txt` (or via `pip3` for python3). ## Batch Shipyard Compute Node OS Support -Batch Shipyard is currently only compatible with Linux Batch Compute Pools -configured via -[VirtualMachineConfiguration](http://azure-sdk-for-python.readthedocs.io/en/latest/_modules/azure/batch/models/virtual_machine_configuration.html). -Please see the list of -[Azure Batch supported Marketplace Linux VMs](https://azure.microsoft.com/en-us/documentation/articles/batch-linux-nodes/#list-of-virtual-machine-images) -for use with Batch Shipyard. +Batch Shipyard is currently only compatible with +[Azure Batch supported Marketplace Linux VMs](https://azure.microsoft.com/en-us/documentation/articles/batch-linux-nodes/#list-of-virtual-machine-images). ## Documentation Please refer to diff --git a/config_templates/jobs.json b/config_templates/jobs.json index 580a4a1f..f4c980bb 100644 --- a/config_templates/jobs.json +++ b/config_templates/jobs.json @@ -2,6 +2,7 @@ "job_specifications": [ { "id": "dockerjob", + "multi_instance_auto_complete": true, "environment_variables": { "abc": "xyz" }, diff --git a/docs/00-introduction.md b/docs/00-introduction.md index 3a8c39a9..af4b4ee3 100644 --- a/docs/00-introduction.md +++ b/docs/00-introduction.md @@ -9,9 +9,12 @@ job scheduling system leveraging the their jobs are (e.g., executing a binary to process text data), when to run them, where to run them, and on what VM resources they are run on. 
The Azure Batch service takes care of the rest including: compute resource provisioning,
-task scheduling, automatic task recovery and retry on failure, and automatic
-scaling of resources if specified. Costs are incurred only for compute
-resources consumed.
+task scheduling, automatic task recovery and retry on failure, automatic
+scaling of resources if specified, and many other complexities that exist
+at cloud-scale. Costs are incurred only for compute resources consumed, i.e.,
+the same baseline prices for
+[Virtual Machines](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/)
+or [Cloud Services](https://azure.microsoft.com/en-us/pricing/details/cloud-services/).

Azure Batch can handle workloads on any point of the parallel and distributed
processing spectrum, from embarrassingly parallel workloads all the way to
@@ -23,7 +26,7 @@ schedule work on machines.

Compute resources:
```
-Azure Subscription --> Batch Account --> Compute Pool --> Compute Node
+Azure Subscription --> Batch Account --> Compute Pool --> Compute Nodes
```

Batch accounts are provisioned from a valid Azure Subscription. With a
@@ -34,7 +37,7 @@ pools can be provisioned per Batch account.

Compute jobs:
```
-Job --> Task --> Subtask (or tasklet)
+Job --> Tasks --> Subtasks (or tasklets)
```

Jobs are run on compute pools for which tasks are scheduled on to compute
@@ -47,8 +50,8 @@ Files required as part of a task or generated as a side-effect of a task can
be referenced using a compute job hierarchy or a compute node hierarchy (if
the absolute file location is known). Files existing on compute nodes can be
transferred to any accessible endpoint, including Azure Storage. Files
-may also be fetched from live compute nodes (nodes that have not yet been
-deleted).
+may also be fetched from live compute nodes (i.e., nodes that have not yet
+been deleted).

A high level overview of Azure Batch service basics can be found
[here](https://azure.microsoft.com/en-us/documentation/articles/batch-technical-overview/).
diff --git a/docs/02-batch-shipyard-configuration.md b/docs/02-batch-shipyard-configuration.md
index a22015d2..fd3e8e6c 100644
--- a/docs/02-batch-shipyard-configuration.md
+++ b/docs/02-batch-shipyard-configuration.md
@@ -153,13 +153,15 @@ replication mechanism between compute nodes within a compute pool. The
to allow unfettered concurrent downloading from the source registry among
all compute nodes. The following options apply to `peer_to_peer` data
replication options:
-* `enabled` property enables or disables peer-to-peer transfer.
+* `enabled` property enables or disables private peer-to-peer transfer. Note
+that for compute pools with a relatively small number of VMs, peer-to-peer
+transfer may not provide any benefit.
* `compression` property enables or disables compression of image files. It
-  is strongly recommended to keep this enabled.
+is strongly recommended to keep this enabled.
* `concurrent_source_downloads` property specifies the number of simultaneous
downloads allowed to each image.
-* `direct_download_seed_bias` property sets the number of seeds to prefer
-per image.
+* `direct_download_seed_bias` property sets the number of direct download
+seeds to prefer per image before switching to peer-to-peer transfer.

The `global_resources` property contains the Docker image and volume
configuration.
`docker_images` is an array of docker images that should diff --git a/recipes/CNTK-CPU-OpenMPI/docker/Dockerfile b/recipes/CNTK-CPU-OpenMPI/docker/Dockerfile index 46cb744c..e94bebf4 100644 --- a/recipes/CNTK-CPU-OpenMPI/docker/Dockerfile +++ b/recipes/CNTK-CPU-OpenMPI/docker/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for CNTK-CPU-OpenMPI for use with Batch Shipyard on Azure Batch FROM ubuntu:14.04 +MAINTAINER Fred Park # install base system COPY ssh_config /root/.ssh/ diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md index 3007b3b4..a02968e8 100644 --- a/recipes/CNTK-GPU-OpenMPI/README.md +++ b/recipes/CNTK-GPU-OpenMPI/README.md @@ -6,14 +6,15 @@ GPUs using N-series Azure VM instances in an Azure Batch compute pool. ### Pool Configuration **Note: You must be approved for the [Azure N-Series Preview](http://gpu.azure.com/) and have escalated a -customer service support ticket to the Azure Batch team to enable this -feature. Otherwise, your pool allocation will fail.** +customer service support ticket with your Batch account details to the Azure +Batch team to enable this feature. Otherwise, your pool allocation will fail.** The pool configuration should enable the following properties: * `vm_size` must be one of `STANDARD_NC6`, `STANDARD_NC12`, `STANDARD_NC24`, `STANDARD_NV6`, `STANDARD_NV12`, `STANDARD_NV24`. `NC` VM instances feature K80 GPUs for GPU compute acceleration while `NV` VM instances feature -M60 GPUs for visualization workloads. +M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated +compute application, it is best to choose `NC` VM instances. * `publisher` should be `Canonical`. Other publishers will be supported once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they diff --git a/recipes/CNTK-GPU-OpenMPI/docker/Dockerfile b/recipes/CNTK-GPU-OpenMPI/docker/Dockerfile index 666c890e..c96f630a 100644 --- a/recipes/CNTK-GPU-OpenMPI/docker/Dockerfile +++ b/recipes/CNTK-GPU-OpenMPI/docker/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for CNTK-GPU-OpenMPI for use with Batch Shipyard on Azure Batch FROM nvidia/cuda:7.5-cudnn5-devel +MAINTAINER Fred Park # install base system COPY ssh_config /root/.ssh/ diff --git a/recipes/Caffe-GPU/README.md b/recipes/Caffe-GPU/README.md index 47f32ba6..517bee6a 100644 --- a/recipes/Caffe-GPU/README.md +++ b/recipes/Caffe-GPU/README.md @@ -6,14 +6,15 @@ GPUs using N-series Azure VM instances in an Azure Batch compute pool. ### Pool Configuration **Note: You must be approved for the [Azure N-Series Preview](http://gpu.azure.com/) and have escalated a -customer service support ticket to the Azure Batch team to enable this -feature. Otherwise, your pool allocation will fail.** +customer service support ticket with your Batch account details to the Azure +Batch team to enable this feature. Otherwise, your pool allocation will fail.** The pool configuration should enable the following properties: * `vm_size` must be one of `STANDARD_NC6`, `STANDARD_NC12`, `STANDARD_NC24`, `STANDARD_NV6`, `STANDARD_NV12`, `STANDARD_NV24`. `NC` VM instances feature K80 GPUs for GPU compute acceleration while `NV` VM instances feature -M60 GPUs for visualization workloads. +M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated +compute application, it is best to choose `NC` VM instances. * `publisher` should be `Canonical`. Other publishers will be supported once they are available for N-series VMs. * `offer` should be `UbuntuServer`. 
Other offers will be supported once they
diff --git a/recipes/Caffe-GPU/docker/Dockerfile b/recipes/Caffe-GPU/docker/Dockerfile
index 69027112..977c73d2 100644
--- a/recipes/Caffe-GPU/docker/Dockerfile
+++ b/recipes/Caffe-GPU/docker/Dockerfile
@@ -1,6 +1,7 @@
# Dockerfile for Caffe-GPU for use with Batch Shipyard on Azure Batch
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
+MAINTAINER Fred Park

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
diff --git a/recipes/FFmpeg-GPU/README.md b/recipes/FFmpeg-GPU/README.md
new file mode 100644
index 00000000..96b7bf76
--- /dev/null
+++ b/recipes/FFmpeg-GPU/README.md
@@ -0,0 +1,53 @@
+# FFmpeg-GPU
+This recipe shows how to run [FFmpeg](https://ffmpeg.org/) with
+hardware-accelerated encoding/transcoding on GPUs using N-series Azure VM
+instances in an Azure Batch compute pool.
+
+## Configuration
+### Pool Configuration
+**Note: You must be approved for the
+[Azure N-Series Preview](http://gpu.azure.com/) and have escalated a
+customer service support ticket with your Batch account details to the Azure
+Batch team to enable this feature. Otherwise, your pool allocation will fail.**
+
+The pool configuration should enable the following properties:
+* `vm_size` must be one of `STANDARD_NC6`, `STANDARD_NC12`, `STANDARD_NC24`,
+`STANDARD_NV6`, `STANDARD_NV12`, `STANDARD_NV24`. `NC` VM instances feature
+K80 GPUs for GPU compute acceleration while `NV` VM instances feature
+M60 GPUs for visualization workloads. Because FFmpeg is used for transforming
+audio/video, it is best to choose `NV` VM instances.
+* `publisher` should be `Canonical`. Other publishers will be supported
+once they are available for N-series VMs.
+* `offer` should be `UbuntuServer`. Other offers will be supported once they
+are available for N-series VMs.
+* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+available for N-series VMs.
+
+### Global Configuration
+The global configuration should set the following properties:
+* `docker_images` array must have a reference to a valid FFmpeg NVENC
+GPU-enabled Docker image. The
+[alfpark/ffmpeg:3.1.3-nvenc](https://hub.docker.com/r/alfpark/ffmpeg) image
+can be used for this recipe.
+
+### Jobs Configuration
+The jobs configuration should set the following properties within the `tasks`
+array, which should have a task definition containing:
+* `image` should be the name of the Docker image for this container invocation,
+e.g., `alfpark/ffmpeg:3.1.3-nvenc`
+* `command` should contain the command to pass to the Docker run invocation.
+The following command takes an mp4 video file and transcodes it to H.265/HEVC
+using NVENC transcode offload onto the GPU:
+`"-i samplevideo.mp4 -c:v hevc_nvenc -preset default output.mp4"`
+  * `samplevideo.mp4` should be a file that is accessible by the task. You
+    can use the `resource_files` property in the
+    [task configuration](../../docs/02-batch-shipyard-configuration.md) to make
+    any number of files available to the task for processing.
+  * `hevc_nvenc` informs FFmpeg to use the H.265/HEVC NVENC encoder. To
+    encode with H.264 using NVENC, specify `h264_nvenc` instead.
+* `gpu` must be set to `true`. This enables invoking the `nvidia-docker`
+wrapper.
+
+## Dockerfile and supplementary files
+The `Dockerfile` for the Docker image referenced above can be found
+[here](https://github.com/alfpark/docker-ffmpeg/blob/master/nvenc/).
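To tie the Jobs Configuration guidance above together, a minimal `jobs.json` fragment for this recipe might look like the following sketch. The job id and input file name are illustrative placeholders, and unrelated required fields are omitted:

```json
{
    "job_specifications": [
        {
            "id": "ffmpegjob",
            "tasks": [
                {
                    "image": "alfpark/ffmpeg:3.1.3-nvenc",
                    "command": "-i samplevideo.mp4 -c:v hevc_nvenc -preset default output.mp4",
                    "gpu": true
                }
            ]
        }
    ]
}
```

Setting `gpu` to `true` is what routes the task through the `nvidia-docker` wrapper so the GPU devices needed by NVENC are visible inside the container.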
diff --git a/recipes/NAMD-Infiniband-IntelMPI/docker/Dockerfile b/recipes/NAMD-Infiniband-IntelMPI/docker/Dockerfile index 6ef13699..aa0e6319 100644 --- a/recipes/NAMD-Infiniband-IntelMPI/docker/Dockerfile +++ b/recipes/NAMD-Infiniband-IntelMPI/docker/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for NAMD-Infiniband for use with Batch Shipyard on Azure Batch FROM centos:7.1.1503 +MAINTAINER Fred Park # set up base COPY ssh_config /root/.ssh/ diff --git a/recipes/NAMD-TCP/docker/Dockerfile b/recipes/NAMD-TCP/docker/Dockerfile index 4558d21f..e7e0e5d2 100644 --- a/recipes/NAMD-TCP/docker/Dockerfile +++ b/recipes/NAMD-TCP/docker/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for NAMD-TCP for use with Batch Shipyard on Azure Batch FROM centos:7.1.1503 +MAINTAINER Fred Park # set up base and ssh keys COPY ssh_config /root/.ssh/ diff --git a/recipes/README.md b/recipes/README.md index c090fc92..d41f5389 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1,6 +1,6 @@ # Batch Shipyard Recipes This directory contains recipes and sample batch-style Docker workloads for -use with Batch Shipyard. +use with Batch Shipyard on Azure Batch. **NOTE: Not all recipes are populated.** @@ -45,7 +45,7 @@ This NAMD-TCP recipe contains information on how to Dockerize distributed TBC. [OpenFoam](http://www.openfoam.com/) -## Video Processing -### FFmpeg -TBC. -[FFmpeg](https://ffmpeg.org/) +## Audio/Video Processing +### [FFmpeg-GPU](./FFmpeg-GPU) +This recipe contains information on how to use Dockerized +[FFmpeg](https://ffmpeg.org/) on GPUs for use with the N-series Azure VMs. diff --git a/recipes/TensorFlow-GPU/README.md b/recipes/TensorFlow-GPU/README.md index 96a9f8d7..3b2523be 100644 --- a/recipes/TensorFlow-GPU/README.md +++ b/recipes/TensorFlow-GPU/README.md @@ -6,14 +6,15 @@ using N-series Azure VM instances in an Azure Batch compute pool. ### Pool Configuration **Note: You must be approved for the [Azure N-Series Preview](http://gpu.azure.com/) and have escalated a -customer service support ticket to the Azure Batch team to enable this -feature. Otherwise, your pool allocation will fail.** +customer service support ticket with your Batch account details to the Azure +Batch team to enable this feature. Otherwise, your pool allocation will fail.** The pool configuration should enable the following properties: * `vm_size` must be one of `STANDARD_NC6`, `STANDARD_NC12`, `STANDARD_NC24`, `STANDARD_NV6`, `STANDARD_NV12`, `STANDARD_NV24`. `NC` VM instances feature K80 GPUs for GPU compute acceleration while `NV` VM instances feature -M60 GPUs for visualization workloads. +M60 GPUs for visualization workloads. Because TensorFlow is a GPU-accelerated +compute application, it is best to choose `NC` VM instances. * `publisher` should be `Canonical`. Other publishers will be supported once they are available for N-series VMs. * `offer` should be `UbuntuServer`. 
Other offers will be supported once they diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh index 79173c80..2bf27f27 100755 --- a/scripts/shipyard_nodeprep.sh +++ b/scripts/shipyard_nodeprep.sh @@ -38,7 +38,6 @@ offer= p2p= prefix= privatereg= -reboot=0 sku= while getopts "h?ab:dg:no:p:r:s:t:" opt; do @@ -49,7 +48,7 @@ while getopts "h?ab:dg:no:p:r:s:t:" opt; do echo "-a install azurefile docker volume driver" echo "-b [resources] block until resources loaded" echo "-d use docker container for cascade" - echo "-g [reboot:driver version:driver file:nvidia docker pkg] gpu support" + echo "-g [nv-series:driver version:driver file:nvidia docker pkg] gpu support" echo "-n optimize network TCP settings" echo "-o [offer] VM offer" echo "-p [prefix] storage container prefix" @@ -69,7 +68,7 @@ while getopts "h?ab:dg:no:p:r:s:t:" opt; do cascadecontainer=1 ;; g) - gpu=${OPTARG,,} + gpu=$OPTARG ;; n) networkopt=1 @@ -231,8 +230,19 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then # split arg into two IFS=':' read -ra GPUARGS <<< "$gpu" + # take special actions if we're on NV-series VMs if [ ${GPUARGS[0]} == "True" ]; then - reboot=1 + # remove nouveau + apt-get --purge remove xserver-xorg-video-nouveau + rmmod nouveau + # blacklist nouveau from being loaded if rebooted +cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF +blacklist nouveau +blacklist lbm-nouveau +options nouveau modeset=0 +alias nouveau off +alias lbm-nouveau off +EOF fi nvdriverver=${GPUARGS[1]} nvdriver=${GPUARGS[2]} @@ -274,7 +284,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then if [ $cascadecontainer -eq 0 ]; then # install azure storage python dependency apt-get install -y -q python3-pip - pip3 install --no-cache-dir azure-storage==0.32.0 + pip3 install --no-cache-dir azure-storage==0.33.0 # backfill node prep start if [ ! -z ${CASCADE_TIMING+x} ]; then ./perf.py nodeprep start $prefix --ts $npstart --message "offer=$offer,sku=$sku" @@ -426,12 +436,6 @@ fi # touch file to prevent subsequent perf recording if rebooted touch $nodeprepfinished -# reboot node if specified -if [ $reboot -eq 1 ]; then - echo "rebooting node" - reboot -fi - # execute cascade if [ $cascadecontainer -eq 1 ]; then detached=
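For context on the reworked `-g` option handling above, the following sketch illustrates the colon-delimited argument format that `shipyard_nodeprep.sh` splits into `GPUARGS`. The driver and package file names are placeholders rather than values from this patch, and Batch Shipyard normally constructs this invocation itself during pool provisioning:

```bash
#!/usr/bin/env bash
# Illustrative only: the -g argument packs four colon-delimited fields that
# the node prep script reads via `IFS=':' read -ra GPUARGS <<< "$gpu"`.
is_nv=True                                   # GPUARGS[0]: NV-series flag; triggers nouveau removal/blacklist
driver_version="<nvidia-driver-version>"     # GPUARGS[1]: placeholder
driver_file="<nvidia-driver-installer>.run"  # GPUARGS[2]: placeholder
nvidia_docker_pkg="<nvidia-docker>.deb"      # GPUARGS[3]: placeholder

./shipyard_nodeprep.sh -o UbuntuServer \
    -g "${is_nv}:${driver_version}:${driver_file}:${nvidia_docker_pkg}"
```

The leading `True`/`False` field is what the script checks before purging `xserver-xorg-video-nouveau` and blacklisting the `nouveau` module on NV-series VMs.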