E2E: Replace outdated images with latest ones

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
kubeflow · google-oss-prow · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024 · a798f4793f260b42d3f8e5bee80453d17ce32fb7
commit a798f4793f260b42d3f8e5bee80453d17ce32fb7
diff --git a/.github/workflows/build-and-publish-images.yaml b/.github/workflows/build-and-publish-images.yaml
@@ -31,6 +31,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Free-Up Disk Space
+        uses: ./.github/workflows/free-up-disk-space
+
       - name: Docker Login
         # Trigger workflow only for kubeflow/training-operator repository with specific branch (master, v.*-branch) or tag (v.*).
         if: >-

diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml
@@ -0,0 +1,54 @@
+name: Free-Up Disk Space
+description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker
+
+# Referenced by repository-local path (uses: ./.github/workflows/free-up-disk-space),
+# so the calling job must run actions/checkout before this action.
+runs:
+  using: composite
+  steps:
+    # This step is a workaround to avoid the "No space left on device" error.
+    # ref: https://github.com/actions/runner-images/issues/2840
+    - name: Remove unnecessary files
+      shell: bash
+      run: |
+        echo "Disk usage before cleanup:"
+        df -hT
+
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /usr/local/share/boost
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /usr/local/share/powershell
+        sudo rm -rf /usr/share/swift
+
+        echo "Disk usage after cleanup:"
+        df -hT
+
+    # Remove every image not used by a container, then report remaining usage.
+    - name: Prune docker images
+      shell: bash
+      run: |
+        docker image prune -a -f
+        docker system df
+        df -hT
+
+    # Relocate the Docker data root to /mnt/docker and leave a symlink at the
+    # default path. The daemon is stopped while the directory is moved.
+    - name: Move docker data directory
+      shell: bash
+      run: |
+        echo "Stopping docker service ..."
+        sudo systemctl stop docker
+        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
+        DOCKER_ROOT_DIR=/mnt/docker
+        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo mv "${DOCKER_DEFAULT_ROOT_DIR}" "${DOCKER_ROOT_DIR}"
+        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo ln -s "${DOCKER_ROOT_DIR}" "${DOCKER_DEFAULT_ROOT_DIR}"
+        sudo ls -l "${DOCKER_DEFAULT_ROOT_DIR}"
+        echo "Starting docker service ..."
+        sudo systemctl daemon-reload
+        sudo systemctl start docker
+        echo "Docker service status:"
+        sudo systemctl --no-pager -l -o short status docker
diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml
@@ -55,25 +55,12 @@ jobs:
             python-version: "3.10"
 
     steps:
-      # This step is a Workaround to avoid the "No space left on device" error.
-      # ref: https://github.com/actions/runner-images/issues/2840
-      - name: Remove unnecessary files
-        shell: bash
-        run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/local/share/powershell
-          sudo rm -rf /usr/share/swift
-
-          echo "Disk usage after cleanup:"
-          df -h
-
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Free-Up Disk Space
+        uses: ./.github/workflows/free-up-disk-space
+
       - name: Setup Python
         uses: actions/setup-python@v4
         with:

diff --git a/.github/workflows/template-publish-image/action.yaml b/.github/workflows/template-publish-image/action.yaml
@@ -23,50 +23,6 @@ inputs:
 runs:
   using: composite
   steps:
-    # This step is a Workaround to avoid the "No space left on device" error.
-    # ref: https://github.com/actions/runner-images/issues/2840
-    - name: Remove unnecessary files
-      shell: bash
-      run: |
-        echo "Disk usage before cleanup:"
-        df -hT
-
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /opt/ghc
-        sudo rm -rf /usr/local/share/boost
-        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-        sudo rm -rf /usr/local/lib/android
-        sudo rm -rf /usr/local/share/powershell
-        sudo rm -rf /usr/share/swift
-
-        echo "Disk usage after cleanup:"
-        df -hT
-
-    - name: Prune docker images
-      shell: bash
-      run: |
-        docker image prune -a -f
-        docker system df
-        df -hT
-
-    - name: Move docker data directory
-      shell: bash
-      run: |
-        echo "Stopping docker service ..."
-        sudo systemctl stop docker
-        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
-        DOCKER_ROOT_DIR=/mnt/docker
-        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
-        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
-        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
-        echo "Starting docker service ..."
-        sudo systemctl daemon-reload
-        sudo systemctl start docker
-        echo "Docker service status:"
-        sudo systemctl --no-pager -l -o short status docker
-
     - name: Setup QEMU
       uses: docker/setup-qemu-action@v2
       with:

diff --git a/examples/mpi/tensorflow-mnist.yaml b/examples/mpi/tensorflow-mnist.yaml
@@ -12,7 +12,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             command:
             - mpirun
@@ -35,7 +35,7 @@ spec:
             - btl
             - ^openib
             - python
-            - /examples/tensorflow2_mnist.py
+            - /horovod/examples/tensorflow2/tensorflow2_mnist.py
             resources:
               limits:
                 cpu: 1
@@ -45,7 +45,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             resources:
               limits:

diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml
@@ -11,10 +11,10 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
     Worker:
@@ -24,9 +24,9 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 
@@ -24,7 +24,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 

diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:
@@ -23,7 +23,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:

diff --git a/sdk/python/test/e2e/test_e2e_mpijob.py b/sdk/python/test/e2e/test_e2e_mpijob.py
@@ -39,7 +39,7 @@
 logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)
 
 TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
-JOB_NAME = "mpijob-mxnet-ci-test"
+JOB_NAME = "mpijob-pytorch-ci-test"
 CONTAINER_NAME = "mpi"
 GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "")
 
@@ -182,7 +182,7 @@ def generate_mpijob(
 def generate_containers() -> Tuple[V1Container, V1Container]:
     launcher_container = V1Container(
         name=CONTAINER_NAME,
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
+        image="horovod/horovod:0.28.1",
         command=["mpirun"],
         args=[
             "-np",
@@ -202,19 +202,18 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
             "-mca",
             "btl",
             "^openib",
-            # "python", "/examples/tensorflow2_mnist.py"]
             "python",
-            "/examples/pytorch_mnist.py",
+            "/horovod/examples/pytorch/pytorch_mnist.py",
             "--epochs",
             "1",
         ],
         resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     worker_container = V1Container(
-        name="mpi",
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        name=CONTAINER_NAME,
+        image="horovod/horovod:0.28.1",
+        resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
     )
 
     return launcher_container, worker_container
diff --git a/sdk/python/test/e2e/test_e2e_mxjob.py b/sdk/python/test/e2e/test_e2e_mxjob.py
@@ -233,21 +233,21 @@ def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
             "dist_sync",
         ],
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
 
     server_container = V1Container(
         name=CONTAINER_NAME,
         image="docker.io/kubeflow/mxnet-gpu:latest",
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     scheduler_container = V1Container(
         name=CONTAINER_NAME,
         image="docker.io/kubeflow/mxnet-gpu:latest",
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     return worker_container, server_container, scheduler_container
diff --git a/sdk/python/test/e2e/test_e2e_paddlejob.py b/sdk/python/test/e2e/test_e2e_paddlejob.py
@@ -158,5 +158,5 @@ def generate_container() -> V1Container:
         image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",
         command=["python"],
         args=["-m", "paddle.distributed.launch", "run_check"],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py
@@ -264,7 +264,7 @@ def generate_pytorchjob(
 def generate_container() -> V1Container:
     return V1Container(
         name=CONTAINER_NAME,
-        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
-        args=["--backend", "gloo"],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        image="kubeflow/pytorch-dist-mnist:latest",
+        args=["--backend", "gloo", "--epochs", "1"],
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_tfjob.py b/sdk/python/test/e2e/test_e2e_tfjob.py
@@ -164,5 +164,5 @@ def generate_container() -> V1Container:
             "--learning_rate=0.01",
             "--batch_size=150",
         ],
-        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.75"}),
+        resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_xgboostjob.py b/sdk/python/test/e2e/test_e2e_xgboostjob.py
@@ -190,5 +190,5 @@ def generate_container() -> V1Container:
             "--model_path=/tmp/xgboost-model",
             "--model_storage_type=local",
         ],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )