# Title: Task YAML — SkyPilot documentation # URL Source: https://docs.skypilot.co/en/latest/reference/yaml-spec.html # Markdown Content: # SkyPilot provides an intuitive YAML interface to specify a task (resource requirements, setup commands, run commands, file mounts, storage mounts, and so on). # Task YAMLs can be used with the [CLI](https://docs.skypilot.co/en/latest/reference/cli.html#cli), or the programmatic API ([`sky.Task.from_yaml()`](https://docs.skypilot.co/en/latest/reference/api.html#sky.Task.from_yaml "sky.Task.from_yaml")). # Available fields: # \# Task name (optional), used for display purposes. # name: my-task # \# Working directory (optional), synced to ~/sky\_workdir on the remote cluster # \# each time launch or exec is run with the yaml file. # # # \# Commands in "setup" and "run" will be executed under it. # # # \# If a relative path is used, it's evaluated relative to the location from # \# which \`sky\` is called. # # # \# To exclude files from syncing, see # \# https://docs.skypilot.co/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files # workdir: ~/my-task-code # \# Number of nodes (optional; defaults to 1) to launch including the head node. # # # \# A task can set this to a smaller value than the size of a cluster. # num\_nodes: 4 # \# Per-node resource requirements (optional). # resources: # cloud: aws \# The cloud to use (optional). # \# The region to use (optional). Auto-failover will be disabled # \# if this is specified. # region: us-east-1 # \# The zone to use (optional). Auto-failover will be disabled # \# if this is specified. # zone: us-east-1a # \# Accelerator name and count per node (optional). # # # \# Use \`sky show-gpus\` to view available accelerator configurations. # # # \# The following three ways are valid for specifying accelerators for a cluster: # # # \# To specify a single type of accelerator: # \# Format: : (or simply , short for a count of 1). # \# accelerators: H100:4 # # # \# To specify an ordered list of accelerators (try the accelerators in # \# the specified order): # \# Format: \[:, ...\] # \# accelerators: \['L4:1', 'H100:1', 'A100:1'\] # # # \# To specify an unordered set of accelerators (optimize all specified # \# accelerators together, and try accelerator with lowest cost first): # \# Format: {:, ...} # \# accelerators: {'L4:1', 'H100:1', 'A100:1'} # accelerators: H100:8 # \# Number of vCPUs per node (optional). # # # \# Format: # \# : exactly vCPUs # \# +: at least vCPUs # # # \# E.g., 4+ means first try to find an instance type with \>\= 4 vCPUs. If # \# not found, use the next cheapest instance with more than 4 vCPUs. # cpus: 4+ # \# Memory in GiB per node (optional). # # # \# Format: # \# : exactly GiB # \# +: at least GiB # # # \# E.g., 32+ means first try to find an instance type with \>\= 32 GiB. If # \# not found, use the next cheapest instance with more than 32 GiB. # memory: 32+ # \# Instance type to use (optional). If 'accelerators' is specified, # \# the corresponding instance type is automatically inferred. # instance\_type: p3.8xlarge # \# Whether the cluster should use spot instances (optional). # \# If unspecified, defaults to False (on-demand instances). # use\_spot: False # \# The recovery strategy for managed jobs (optional). # # # \# In effect for managed jobs. Possible values are \`FAILOVER\` and \`EAGER\_NEXT\_REGION\`. # # # \# If \`FAILOVER\` is specified, the job will be restarted in the same region # \# if the node fails, and go to the next region if no available resources # \# are found in the same region. # # # \# If \`EAGER\_NEXT\_REGION\` is specified, the job will go to the next region # \# directly if the node fails. This is useful for spot instances, as in # \# practice, preemptions in a region usually indicate a shortage of resources # \# in that region. # # # \# default: EAGER\_NEXT\_REGION # job\_recovery: none # \# Or, to allow up to 3 restarts (default: 0) on user code errors: # \# job\_recovery: # \# strategy: EAGER\_NEXT\_REGION # \# max\_restarts\_on\_errors: 3 # \# Disk size in GB to allocate for OS (mounted at /). Increase this if you # \# have a large working directory or tasks that write out large outputs. # disk\_size: 256 # \# Disk tier to use for OS (optional). # \# Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium'). # \# if 'best' is specified, use the best disk tier enabled. # \# Rough performance estimate: # \# low: 1000 IOPS; read 90 MB/s; write 90 MB/s # \# medium: 3000 IOPS; read 220 MB/s; write 220 MB/s # \# high: 6000 IOPS; read 400 MB/s; write 400 MB/s # \# ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s # \# Measured by examples/perf/storage\_rawperf.yaml # disk\_tier: medium # \# Ports to expose (optional). # # # \# All ports specified here will be exposed to the public Internet. Under # \# the hood, a firewall rule / inbound rule is automatically added to allow # \# inbound traffic to these ports. Applies to all VMs of a cluster created # \# with this field set. # # # \# Currently only TCP protocol is supported. # # # \# Ports Lifecycle: # \# A cluster's ports will be updated whenever \`sky launch\` is executed. # \# When launching an existing cluster, any new ports specified will be # \# opened for the cluster, and the firewall rules for old ports will never # \# be removed until the cluster is terminated. # # # \# Could be an integer, a range, or a list of integers and ranges: # \# To specify a single port: # \# ports: 8081 # \# To specify a port range: # \# ports: 10052-10100 # \# To specify multiple ports / port ranges: # \# ports: # \# - 8080 # \# - 10022-10040 # ports: 8081 # \# Additional accelerator metadata (optional); only used for TPU node # \# and TPU VM. # \# Example usage: # # # \# To request a TPU VM: # \# accelerator\_args: # \# tpu\_vm: True (optional, default: True) # # # \# To request a TPU node: # \# accelerator\_args: # \# tpu\_name: ... # \# tpu\_vm: False # # # \# By default, the value for "runtime\_version" is decided based on which is # \# requested and should work for either case. If passing in an incompatible # \# version, GCP will throw an error during provisioning. # accelerator\_args: # \# Default is "tpu-vm-base" for TPU VM and "2.12.0" for TPU node. # runtime\_version: tpu-vm-base # \# tpu\_name: mytpu # \# tpu\_vm: True # True to use TPU VM (the default); False to use TPU node. # \# Custom image id (optional, advanced). The image id used to boot the # \# instances. Only supported for AWS, GCP, OCI and IBM (for non-docker image). # \# If not specified, SkyPilot will use the default debian-based image # \# suitable for machine learning tasks. # # # \# Docker support # \# You can specify docker image to use by setting the image\_id to # \# \`docker:\` for Azure, AWS and GCP. For example, # \# image\_id: docker:ubuntu:latest # \# Currently, only debian and ubuntu images are supported. # \# If you want to use a docker image in a private registry, you can specify your # \# username, password, and registry server as task environment variable. For # \# details, please refer to the \`envs\` section below. # # # \# AWS # \# To find AWS AMI ids: https://leaherb.com/how-to-find-an-aws-marketplace-ami-image-id # \# You can also change the default OS version by choosing from the # \# following image tags provided by SkyPilot: # \# image\_id: skypilot:gpu-ubuntu-2004 # \# image\_id: skypilot:k80-ubuntu-2004 # \# image\_id: skypilot:gpu-ubuntu-1804 # \# image\_id: skypilot:k80-ubuntu-1804 # # # \# It is also possible to specify a per-region image id (failover will only # \# go through the regions specified as keys; useful when you have the # \# custom images in multiple regions): # \# image\_id: # \# us-east-1: ami-0729d913a335efca7 # \# us-west-2: ami-050814f384259894c # # # \# GCP # \# To find GCP images: https://cloud.google.com/compute/docs/images # \# image\_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310 # \# Or machine image: https://cloud.google.com/compute/docs/machine-images # \# image\_id: projects/my-project/global/machineImages/my-machine-image # # # \# Azure # \# To find Azure images: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # \# image\_id: microsoft-dsvm:ubuntu-2004:2004:21.11.04 # # # \# OCI # \# To find OCI images: https://docs.oracle.com/en-us/iaas/images # \# You can choose the image with OS version from the following image tags # \# provided by SkyPilot: # \# image\_id: skypilot:gpu-ubuntu-2204 # \# image\_id: skypilot:gpu-ubuntu-2004 # \# image\_id: skypilot:gpu-oraclelinux9 # \# image\_id: skypilot:gpu-oraclelinux8 # \# image\_id: skypilot:cpu-ubuntu-2204 # \# image\_id: skypilot:cpu-ubuntu-2004 # \# image\_id: skypilot:cpu-oraclelinux9 # \# image\_id: skypilot:cpu-oraclelinux8 # # # \# It is also possible to specify your custom image's OCID with OS type, # \# for example: # \# image\_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaaywwfvy67wwe7f24juvjwhyjn3u7g7s3wzkhduxcbewzaeki2nt5q:oraclelinux # \# image\_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaa5tnuiqevhoyfnaa5pqeiwjv6w5vf6w4q2hpj3atyvu3yd6rhlhyq:ubuntu # # # \# IBM # \# Create a private VPC image and paste its ID in the following format: # \# image\_id: # \# To create an image manually: # \# https://cloud.ibm.com/docs/vpc?topic=vpc-creating-and-using-an-image-from-volume. # \# To use an official VPC image creation tool: # \# https://www.ibm.com/cloud/blog/use-ibm-packer-plugin-to-create-custom-images-on-ibm-cloud-vpc-infrastructure # \# To use a more limited but easier to manage tool: # \# https://github.com/IBM/vpc-img-inst # image\_id: ami-0868a20f5a3bf9702 # \# Labels to apply to the instances (optional). # # # \# If specified, these labels will be applied to the VMs or pods created # \# by SkyPilot. These are useful for assigning metadata that may be # \# used by external tools. Implementation depends on the chosen cloud - # \# On AWS, labels map to instance tags. On GCP, labels map to instance # \# labels. On Kubernetes, labels map to pod labels. On other clouds, # \# labels are not supported and will be ignored. # # # \# Note: Labels are applied only on the first launch of the cluster. They # \# are not updated on subsequent launches. # labels: # my-label: my-value # \# Candidate resources (optional). If specified, SkyPilot will only use # \# these candidate resources to launch the cluster. The fields specified # \# outside of \`any\_of\`, \`ordered\` will be used as the default values for # \# all candidate resources, and any duplicate fields specified inside # \# \`any\_of\`, \`ordered\` will override the default values. # \# \`any\_of:\` means that SkyPilot will try to find a resource that matches # \# any of the candidate resources, i.e. the failover order will be decided # \# by the optimizer. # \# \`ordered:\` means that SkyPilot will failover through the candidate # \# resources with the specified order. # \# Note: accelerators under \`any\_of\` and \`ordered\` cannot be a list or set. # any\_of: # \- cloud: aws # region: us-west-2 # accelerators: H100 # \- cloud: gcp # accelerators: H100 # \# Environment variables (optional). These values can be accessed in the # \# \`file\_mounts\`, \`setup\`, and \`run\` sections below. # # # \# Values set here can be overridden by a CLI flag: # \# \`sky launch/exec --env ENV=val\` (if ENV is present). # # # \# If you want to use a docker image as runtime environment in a private # \# registry, you can specify your username, password, and registry server as # \# task environment variable. For example: # \# envs: # \# SKYPILOT\_DOCKER\_USERNAME: # \# SKYPILOT\_DOCKER\_PASSWORD: # \# SKYPILOT\_DOCKER\_SERVER: # # # \# SkyPilot will execute \`docker login --username --password # \# \` before pulling the docker image. For \`docker # \# login\`, see https://docs.docker.com/engine/reference/commandline/login/ # # # \# You could also specify any of them through the CLI flag if you don't want # \# to store them in your yaml file or if you want to generate them for # \# constantly changing password. For example: # \# sky launch --env SKYPILOT\_DOCKER\_PASSWORD=$(aws ecr get-login-password --region us-east-1). # # # \# For more information about docker support in SkyPilot, please refer to the \`image\_id\` section above. # envs: # MY\_BUCKET: skypilot-temp-gcs-test # MY\_LOCAL\_PATH: tmp-workdir # MODEL\_SIZE: 13b # file\_mounts: # \# Uses rsync to sync local files/directories to all nodes of the cluster. # # # \# If a relative path is used, it's evaluated relative to the location from # \# which \`sky\` is called. # # # \# If symlinks are present, they are copied as symlinks, and their targets # \# must also be synced using file\_mounts to ensure correctness. # /remote/dir1/file: /local/dir1/file # /remote/dir2: /local/dir2 # \# Create a S3 bucket named sky-dataset, uploads the contents of # \# /local/path/datasets to the bucket, and marks the bucket as persistent # \# (it will not be deleted after the completion of this task). # \# Symlinks and their contents are NOT copied. # # # \# Mounts the bucket at /datasets-storage on every node of the cluster. # /datasets-storage: # name: sky-dataset \# Name of storage, optional when source is bucket URI # source: /local/path/datasets \# Source path, can be local or bucket URI. Optional, do not specify to create an empty bucket. # store: s3 \# Could be either 's3', 'gcs', 'azure', 'r2', 'oci', or 'ibm'; default: None. Optional. # persistent: True \# Defaults to True; can be set to false to delete bucket after cluster is downed. Optional. # mode: MOUNT \# Either MOUNT or COPY. Defaults to MOUNT. Optional. # \# Copies a cloud object store URI to the cluster. Can be private buckets. # /datasets-s3: s3://my-awesome-dataset # \# Demoing env var usage. # /checkpoint/${MODEL\_SIZE}: ~/${MY\_LOCAL\_PATH} # /mydir: # name: ${MY\_BUCKET} \# Name of the bucket. # mode: MOUNT # \# Setup script (optional) to execute on every \`sky launch\`. # \# This is executed before the 'run' commands. # # # \# The '|' separator indicates a multiline string. To specify a single command: # \# setup: pip install -r requirements.txt # setup: | # echo "Begin setup." # pip install -r requirements.txt # echo "Setup complete." # \# Main program (optional, but recommended) to run on every node of the cluster. # run: | # echo "Beginning task." # python train.py # \# Demoing env var usage. # echo Env var MODEL\_SIZE has value: ${MODEL\_SIZE} # Experimental Configurations[#](https://docs.skypilot.co/en/latest/reference/yaml-spec.html#experimental-configurations "Permalink to this heading") # --------------------------------------------------------------------------------------------------------------------------------------------------- # Note # Experimental features and APIs may be changed or removed without any notice. # In additional to the above fields, SkyPilot also supports the following experimental fields in the task YAML: # experimental: # \# Override the configs in ~/.sky/config.yaml from a task level. # # # \# The following fields can be overridden. Please refer to docs of Advanced # \# Configuration for more details of those fields: # \# https://docs.skypilot.co/en/latest/reference/config.html # config\_overrides: # docker: # run\_options: ... # kubernetes: # pod\_config: ... # provision\_timeout: ... # gcp: # managed\_instance\_group: ... # nvidia\_gpus: # disable\_ecc: ... workdir: . resources: accelerators: ["H100:1", "A100-80GB:1"] ports: - 7999 # main ART server - 8000 # vLLM server envs: ALL_EXTRAS: "false" GIT_USER_NAME: "Your Name" GIT_USER_EMAIL: "your.email@example.com" GITHUB_TOKEN: "" setup: | ./scripts/setup.sh