diff --git a/.github/workflows/H-Coverage.yml b/.github/workflows/H-Coverage.yml
index 8c0396e9368362..4907983934dad6 100644
--- a/.github/workflows/H-Coverage.yml
+++ b/.github/workflows/H-Coverage.yml
@@ -157,7 +157,7 @@ jobs:
           source ${{ github.workspace }}/../../../proxy
           pip install -r python/requirements.txt
           mkdir build && cd build
-          cmake .. -DPY_VERSION=3.10 -DWITH_GPU=ON -DWITH_DISTRIBUTE=ON -DWITH_TESTING=ON -DCUDA_ARCH_NAME=Hopper -DFA_JOB_POOLS_COMPILE=1 -DWITH_CUDNN_FRONTEND=ON -DON_INFER=OFF
+          cmake .. -DPY_VERSION=3.10 -DWITH_GPU=ON -DWITH_DISTRIBUTE=ON -DWITH_TESTING=ON -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN="90" -DFA_JOB_POOLS_COMPILE=1 -DWITH_CUDNN_FRONTEND=ON -DON_INFER=OFF -DWITH_NVSHMEM=ON
           make -j20
           '
 
@@ -338,3 +338,223 @@ jobs:
           docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
           docker stop ${{ env.container_name }}
           docker rm ${{ env.container_name }}
+
+  # Single-card PaddleFleet unit tests, run against the wheel fetched from PR/h-coverage/<PR_ID>/<COMMIT_ID> (presumably uploaded by the build job - TODO confirm).
+  fleet_single_card_test:
+    name: Fleet Unit test (single card)
+    needs: [build]
+    if: needs.build.outputs.can-skip != 'true'
+    runs-on:
+      group: Fleet-H-single-card
+    env:
+      PIP_CACHE_DIR: /root/.cache/pip
+      CACHE_DIR: /root/.cache
+      TASK: paddle-fleet-CI-${{ github.event.pull_request.number }}-single-card-test
+    steps:
+      - name: Determine the runner
+        run: |
+          # GPU index = second dash-separated field of the second path component of $PWD, plus 3 (presumably the runner slot number - TODO confirm).
+          gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
+          echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
+
+      - name: Check docker image and run container
+        env:
+          GPU_DEVICES: ${{ env.GPU_DEVICES }}
+          docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
+        run: |
+          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+          echo "container_name=${container_name}" >> ${{ github.env }}
+          docker pull $docker_image
+          docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
+            -v "/dev/shm:/dev/shm" \
+            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+            -v ${{ github.workspace }}/../../../proxy:/root/proxy \
+            -v /ssd1/paddle-1/action_cache:/root/.cache \
+            -v ${{ github.workspace }}:/paddle \
+            -e BRANCH \
+            -e PR_ID \
+            -e COMMIT_ID \
+            -e PADDLE_ROOT \
+            -e ci_scripts \
+            -e CACHE_DIR \
+            -e no_proxy \
+            -e CI_name \
+            -e PIP_CACHE_DIR \
+            -e work_dir \
+            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+            -e GITHUB_REPO_NAME="${{ github.repository }}" \
+            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+            -e GITHUB_RUN_ID="${{ github.run_id }}" \
+            -w /paddle --network host ${docker_image}
+
+      - name: Clone PaddleFleet
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          rm -rf * .[^.]*
+          source /root/proxy
+          git clone https://github.com/PaddlePaddle/PaddleFleet.git .
+          git config --global --add safe.directory /paddle
+          git config user.name "PaddleCI"
+          git config user.email "paddle_ci@example.com"
+          git config pull.rebase false
+          mkdir -p /root/.cache/pip
+          pip cache dir
+          echo "Install uv"
+          pip install uv
+          echo "uv sync"
+          git submodule update --init --recursive
+          uv sync --group ci -v > /dev/null
+          '
+
+      - name: Download paddle.tar.gz and install paddle whl
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -c '
+          set -e
+          mkdir -p /PaddlePaddle
+          cd /PaddlePaddle
+          echo "Downloading Paddle.tar.gz from cfs"
+          wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
+          source /root/proxy
+          source /paddle/.venv/bin/activate
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1 # This environment variable allows installing the latest commit-level whl package of Paddle.
+          export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running uv run.
+          export UV_HTTP_TIMEOUT=300
+          uv pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall
+          '
+
+      - name: Single card test
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -xce '
+          pwd
+          source .venv/bin/activate
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1 # This environment variable allows installing the latest commit-level whl package of Paddle.
+          export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running uv run.
+          export UV_HTTP_TIMEOUT=300
+          python -c "import paddle; print(paddle.version.commit)"
+          # The shell runs with -e, so capture the status through || ; a bare failing command would abort the shell before the failure message below is printed.
+          single_card_exit_code=0
+          bash ci/single_card_test.sh || single_card_exit_code=$?
+          if [[ "$single_card_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mSingle card test failed.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mSingle card test succeeded.\033[0m"
+          fi
+          '
+
+      - name: Terminate and delete the container
+        if: ${{ always() }}
+        run: |
+          set +e
+          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+          docker rm -f ${{ env.container_name }}
+
+  # Multi-card PaddleFleet unit tests (container gets all GPUs), run against the wheel fetched from PR/h-coverage/<PR_ID>/<COMMIT_ID>.
+  fleet-multi-card_test:
+    name: Fleet Unit test (multi-card)
+    needs: [build]
+    if: needs.build.outputs.can-skip != 'true'
+    runs-on:
+      group: Fleet-H-multi-card
+    env:
+      PIP_CACHE_DIR: /root/.cache/pip
+      TASK: paddle-fleet-CI-${{ github.event.pull_request.number }}-multi-card_test
+      docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
+    steps:
+      - name: Check docker image and run container
+        run: |
+          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+          echo "container_name=${container_name}" >> ${{ github.env }}
+          docker pull $docker_image
+          docker run -d -t --gpus all --name ${container_name} \
+            -v "/dev/shm:/dev/shm" \
+            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+            -v ${{ github.workspace }}/../../../proxy:/root/proxy \
+            -v ${{ github.workspace }}/../../../.cache:/root/.cache \
+            -v ${{ github.workspace }}:/paddle \
+            -e BRANCH \
+            -e PR_ID \
+            -e COMMIT_ID \
+            -e PADDLE_ROOT \
+            -e ci_scripts \
+            -e CACHE_DIR \
+            -e no_proxy \
+            -e CI_name \
+            -e PIP_CACHE_DIR \
+            -e work_dir \
+            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+            -e GITHUB_REPO_NAME="${{ github.repository }}" \
+            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
+            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+            -e GITHUB_RUN_ID="${{ github.run_id }}" \
+            -w /paddle --network host ${docker_image}
+
+      - name: Clone PaddleFleet
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          rm -rf * .[^.]*
+          source /root/proxy
+          git clone https://github.com/PaddlePaddle/PaddleFleet.git .
+          git config --global --add safe.directory /paddle
+          git config user.name "PaddleCI"
+          git config user.email "paddle_ci@example.com"
+          git config pull.rebase false
+          mkdir -p /root/.cache/pip
+          pip cache dir
+          echo "Install uv"
+          pip install uv
+          echo "uv sync"
+          git submodule update --init --recursive
+          uv sync --group ci -v > /dev/null
+          wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
+          chmod +x /usr/local/bin/yq
+          '
+
+      - name: Download paddle.tar.gz and install paddle whl
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -c '
+          set -e
+          mkdir -p /PaddlePaddle
+          cd /PaddlePaddle
+          echo "Downloading Paddle.tar.gz from cfs"
+          wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
+          source /root/proxy
+          source /paddle/.venv/bin/activate
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1 # This environment variable allows installing the latest commit-level whl package of Paddle.
+          export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running uv run.
+          export UV_HTTP_TIMEOUT=300
+          uv pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall
+          '
+
+      - name: Multi-card test
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /paddle/.venv/bin/activate
+          export PYTHONPATH=$(pwd)
+          python -c "import paddle; print(paddle.version.commit)"
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1 # This environment variable allows installing the latest commit-level whl package of Paddle.
+          export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running uv run.
+          export UV_HTTP_TIMEOUT=300
+          # The shell runs with -e, so capture the status through || ; a bare failing command would abort the shell before the failure message below is printed.
+          multi_card_exit_code=0
+          bash ci/multi-card_test.sh || multi_card_exit_code=$?
+          if [[ "$multi_card_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mMulti card test failed.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mMulti card test succeeded.\033[0m"
+          fi
+          '
+
+      - name: Terminate and delete the container
+        if: ${{ always() }}
+        run: |
+          set +e
+          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+          docker rm -f ${{ env.container_name }}
diff --git a/paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt b/paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt
index d02f291d3d6501..1eb35a17f2e8d2 100644
--- a/paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt
@@ -17,7 +17,7 @@ if(WITH_NVSHMEM)
   cc_library(
     deepep_kernels
     SRCS ${DEEPEP_KERNEL_SRCS}
-    DEPS nvshmem cudadevrt)
+    DEPS nvshmem cudadevrt onednn)
   set_target_properties(deepep_kernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
   set_target_properties(deepep_kernels PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS