Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
E2E: Replace outdated images with latest ones
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y committed Apr 25, 2024
commit a798f4793f260b42d3f8e5bee80453d17ce32fb7
3 changes: 3 additions & 0 deletions .github/workflows/build-and-publish-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v3

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Docker Login
# Trigger workflow only for kubeflow/training-operator repository with specific branch (master, v.*-branch) or tag (v.*).
if: >-
Expand Down
49 changes: 49 additions & 0 deletions .github/workflows/free-up-disk-space/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Composite action that reclaims disk space on GitHub-hosted Ubuntu runners.
#
# It runs three steps in order:
#   1. Delete large preinstalled toolchains that this repository does not use.
#   2. Prune every Docker image that is not used by a container.
#   3. Relocate Docker's data directory from /var/lib/docker to /mnt/docker
#      and leave a symlink at the old path so Docker keeps using its default
#      location transparently.
#
# The action takes no inputs and produces no outputs. It must be referenced
# after the repository has been checked out, because it is a local action
# (`uses: ./.github/workflows/free-up-disk-space`).
name: Free-Up Disk Space
description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker

runs:
  using: composite
  steps:
    # This step is a workaround to avoid the "No space left on device" error.
    # ref: https://github.com/actions/runner-images/issues/2840
    - name: Remove unnecessary files
      shell: bash
      run: |
        echo "Disk usage before cleanup:"
        df -hT

        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /usr/local/share/boost
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /usr/local/share/powershell
        sudo rm -rf /usr/share/swift

        echo "Disk usage after cleanup:"
        df -hT

    # Drop all images not referenced by a container, then report what is left
    # both from Docker's point of view and from the filesystem's.
    - name: Prune docker images
      shell: bash
      run: |
        docker image prune -a -f
        docker system df
        df -hT

    # Docker must be stopped while its data directory is moved; it is started
    # again once the symlink at the default location is in place.
    - name: Move docker data directory
      shell: bash
      run: |
        echo "Stopping docker service ..."
        sudo systemctl stop docker
        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
        DOCKER_ROOT_DIR=/mnt/docker
        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
        # -T treats the destination as the final path instead of a directory
        # to move into: if ${DOCKER_ROOT_DIR} already exists, the data can
        # never end up nested as ${DOCKER_ROOT_DIR}/docker, and a non-empty
        # destination fails the step instead of silently misplacing the data.
        sudo mv -T "${DOCKER_DEFAULT_ROOT_DIR}" "${DOCKER_ROOT_DIR}"
        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
        sudo ln -s "${DOCKER_ROOT_DIR}" "${DOCKER_DEFAULT_ROOT_DIR}"
        # Print the symlink entry directly so a failing `ls` is not masked by
        # an enclosing `echo "$(...)"`.
        sudo ls -l "${DOCKER_DEFAULT_ROOT_DIR}"
        echo "Starting docker service ..."
        sudo systemctl daemon-reload
        sudo systemctl start docker
        echo "Docker service status:"
        sudo systemctl --no-pager -l -o short status docker
19 changes: 3 additions & 16 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,12 @@ jobs:
python-version: "3.10"

steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift

echo "Disk usage after cleanup:"
df -h

- name: Checkout
uses: actions/checkout@v3

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v4
with:
Expand Down
44 changes: 0 additions & 44 deletions .github/workflows/template-publish-image/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,50 +23,6 @@ inputs:
runs:
using: composite
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
echo "Disk usage before cleanup:"
df -hT

sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift

echo "Disk usage after cleanup:"
df -hT

- name: Prune docker images
shell: bash
run: |
docker image prune -a -f
docker system df
df -hT

- name: Move docker data directory
shell: bash
run: |
echo "Stopping docker service ..."
sudo systemctl stop docker
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
DOCKER_ROOT_DIR=/mnt/docker
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
echo "Starting docker service ..."
sudo systemctl daemon-reload
sudo systemctl start docker
echo "Docker service status:"
sudo systemctl --no-pager -l -o short status docker

- name: Setup QEMU
uses: docker/setup-qemu-action@v2
with:
Expand Down
6 changes: 3 additions & 3 deletions examples/mpi/tensorflow-mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
template:
spec:
containers:
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
- image: horovod/horovod:0.28.1
name: mpi
command:
- mpirun
Expand All @@ -35,7 +35,7 @@ spec:
- btl
- ^openib
- python
- /examples/tensorflow2_mnist.py
- /horovod/examples/tensorflow2/tensorflow2_mnist.py
resources:
limits:
cpu: 1
Expand All @@ -45,7 +45,7 @@ spec:
template:
spec:
containers:
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
- image: horovod/horovod:0.28.1
name: mpi
resources:
limits:
Expand Down
8 changes: 4 additions & 4 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "gloo"]
# Comment out the below resources to use the CPU.
resources:
resources:
limits:
nvidia.com/gpu: 1
Worker:
Expand All @@ -24,9 +24,9 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "gloo"]
# Comment out the below resources to use the CPU.
resources:
resources:
limits:
nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "mpi"]
# Comment out the below resources to use the CPU.
resources:
Expand All @@ -24,7 +24,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "mpi"]
# Comment out the below resources to use the CPU.
resources:
Expand Down
4 changes: 2 additions & 2 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "nccl"]
resources:
limits:
Expand All @@ -23,7 +23,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "nccl"]
resources:
limits:
Expand Down
13 changes: 6 additions & 7 deletions sdk/python/test/e2e/test_e2e_mpijob.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
JOB_NAME = "mpijob-mxnet-ci-test"
JOB_NAME = "mpijob-pytorch-ci-test"
CONTAINER_NAME = "mpi"
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "")

Expand Down Expand Up @@ -182,7 +182,7 @@ def generate_mpijob(
def generate_containers() -> Tuple[V1Container, V1Container]:
launcher_container = V1Container(
name=CONTAINER_NAME,
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
image="horovod/horovod:0.28.1",
command=["mpirun"],
args=[
"-np",
Expand All @@ -202,19 +202,18 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
"-mca",
"btl",
"^openib",
# "python", "/examples/tensorflow2_mnist.py"]
"python",
"/examples/pytorch_mnist.py",
"/horovod/examples/pytorch/pytorch_mnist.py",
"--epochs",
"1",
],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
)

worker_container = V1Container(
name="mpi",
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
name=CONTAINER_NAME,
image="horovod/horovod:0.28.1",
resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
)

return launcher_container, worker_container
6 changes: 3 additions & 3 deletions sdk/python/test/e2e/test_e2e_mxjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,21 @@ def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
"dist_sync",
],
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
)

server_container = V1Container(
name=CONTAINER_NAME,
image="docker.io/kubeflow/mxnet-gpu:latest",
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
)

scheduler_container = V1Container(
name=CONTAINER_NAME,
image="docker.io/kubeflow/mxnet-gpu:latest",
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
)

return worker_container, server_container, scheduler_container
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_paddlejob.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,5 +158,5 @@ def generate_container() -> V1Container:
image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",
command=["python"],
args=["-m", "paddle.distributed.launch", "run_check"],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
)
6 changes: 3 additions & 3 deletions sdk/python/test/e2e/test_e2e_pytorchjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def generate_pytorchjob(
def generate_container() -> V1Container:
return V1Container(
name=CONTAINER_NAME,
image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
args=["--backend", "gloo"],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
image="kubeflow/pytorch-dist-mnist:latest",
args=["--backend", "gloo", "--epochs", "1"],
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
)
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_tfjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,5 +164,5 @@ def generate_container() -> V1Container:
"--learning_rate=0.01",
"--batch_size=150",
],
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.75"}),
resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
)
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_xgboostjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,5 +190,5 @@ def generate_container() -> V1Container:
"--model_path=/tmp/xgboost-model",
"--model_storage_type=local",
],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
)