diff --git a/.archive.mk b/.archive.mk index 0d140d9..6cc2975 100755 --- a/.archive.mk +++ b/.archive.mk @@ -3,16 +3,16 @@ # Generates a tar from all top-level directory in this current folder, without hidden files # -Andreas Herten, 2021 April 27 .PHONY: all -all: tut138-multi-gpu.tar.gz +all: tut147s1-multi-gpu.tar.gz -SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut138-multi-gpu.tar.gz" -printf '%P\n' | sort -h) +SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut147s1-multi-gpu.tar.gz" -printf '%P\n' | sort -h) -tut138-multi-gpu.tar.gz: $(shell find . -not -name "tut138-multi-gpu.tar.gz") +tut147s1-multi-gpu.tar.gz: $(shell find . -not -name "tut147s1-multi-gpu.tar.gz") # if ! grep -q "Please check Github"; then \ sed -i '1 i***Please check GitHub repo for latest version of slides: https://github.com/FZJ-JSC/tutorial-multi-gpu/ ***\n' README.md; \ fi; sed -i '1 i***Please check GitHub repo for latest version of slides: https://github.com/FZJ-JSC/tutorial-multi-gpu/ ***\n' README.md - tar czf $@ --transform 's,^,SC21-tut138-Multi-GPU/,' --exclude=".*" $(SOURCES) + tar czf $@ --transform 's,^,ISC22-tut147s1-Multi-GPU/,' --exclude=".*" $(SOURCES) # if grep -q "Please check Github"; then \ sed -i '2d' README.md; \ fi diff --git a/.etc/.set-facl-permissions.sh b/.etc/.set-facl-permissions.sh index 72c8885..1bc8de4 100644 --- a/.etc/.set-facl-permissions.sh +++ b/.etc/.set-facl-permissions.sh @@ -3,8 +3,8 @@ set -x for user in haghighimood1 kraus1 hrywniak1 oden1 garciadegonzalo1; do - setfacl -m u:$user:rwx -R $PROJECT_training2124/common/ - setfacl -m u:$user:rwx -R $PROJECT_training2124/env.sh + setfacl -m u:$user:rwx -R $PROJECT_training2216/common/ + setfacl -m u:$user:rwx -R $PROJECT_training2216/env.sh done set +x \ No newline at end of file diff --git a/.etc/deploy-material.sh b/.etc/deploy-material.sh index 77ea069..07e18dd 100755 --- a/.etc/deploy-material.sh +++ b/.etc/deploy-material.sh @@ -1 +1 @@ -rsync --archive --exclude=".*" --verbose ../ judac:/p/project/training2125/common/material/ +rsync --archive --exclude=".*" --verbose ../ judac:/p/project/training2216/common/material/ diff --git a/.etc/deploy.sh b/.etc/deploy.sh index 6d0ff18..55eb7c6 100755 --- a/.etc/deploy.sh +++ b/.etc/deploy.sh @@ -1 +1 @@ -rsync --archive --exclude="deploy.sh" --verbose . judac:/p/project/training2125/common/environment/ +rsync --archive --exclude="deploy.sh" --verbose . 
judac:/p/project/training2216/common/environment/ diff --git a/.etc/instructions-header.md b/.etc/instructions-header.md new file mode 100644 index 0000000..db1c36f --- /dev/null +++ b/.etc/instructions-header.md @@ -0,0 +1,7 @@ +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + diff --git a/.etc/instructions-update-header.sh b/.etc/instructions-update-header.sh new file mode 100644 index 0000000..5771cac --- /dev/null +++ b/.etc/instructions-update-header.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +for f in $(fd -H "Instructions.md" ../); do + cat <(cat instructions-header.md) <(sed -n -e '/## /,$p' $f) | sponge $f +done + +cwd=$(pwd) +for f in $(fd -H "copy.mk" ../); do + cd $(dirname $f) + ./copy.mk + cd $cwd +done \ No newline at end of file diff --git a/.etc/jsccourse-bashrc.sh b/.etc/jsccourse-bashrc.sh index d00db7b..491c5ee 100644 --- a/.etc/jsccourse-bashrc.sh +++ b/.etc/jsccourse-bashrc.sh @@ -1,8 +1,20 @@ +################################################ +# JSC Course bashrc +# +# This file is usually called "env.sh" and is to be loaded as the very first part of a course; it sets up environment variables and commands which are relied upon in the course. +# +# There are a number of opportunities to steer variables in this script from the outside. +# * $JSCCOURSE_DIR_LOCAL_BASE: If this variable is set, it will be used within the target to rsync the material to. In this folder, the course folder will be created. It defaults to $HOME +# * $partition: If this variable is set, it will be used to specify the partition to run on.
It has a system-specific default +# * $_JSCCOURSE_OVERRIDE_LOCALE: If this variable is set (to anything), this env.sh will not set all locale stuff to en_US.UTF-8 +# +# Andreas Herten, >2017 +################################################ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then - project="training2125" + project="training2216" export JSCCOURSE_DIR_GROUP=/p/project/$project - export JSCCOURSE_DIR_LOCAL=$HOME/SC21-Multi-GPU-Tutorial + export JSCCOURSE_DIR_LOCAL=${JSCCOURSE_DIR_LOCAL_BASE:-$HOME}/ISC22-Multi-GPU-Tutorial export _JSCCOURSE_ENV_SOURCED="$(date)" export C_V_D="0,1,2,3" @@ -11,10 +23,8 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then res="" currentday=$(date +%d) - if [[ "$currentday" == "14" ]]; then - res="--reservation multi-gpu-tutorial-2021-11-14" - elif [[ "$currentday" == "15" ]]; then - res="--reservation multi-gpu-tutorial-2021-11-14" + if [[ "$currentday" == "29" ]]; then + res="--reservation multi-gpu-tutorial-2022-05-29" fi export SLURM_NTASKS=1 @@ -28,11 +38,17 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then export NP=2 export PSP_CUDA_ENFORCE_STAGING=1 JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--ntasks-per-node 1" - partition=gpus + partition=${partition:-gpus} ;; - juwels|juwelsbooster|jureca) + juwels|juwelsbooster) ngpus=4 export NP=4 + partition=${partition:-booster} + ;; + jurecadc) + ngpus=4 + export NP=4 + partition=${partition:-dc-gpu} ;; *) echo "This system is not yet tested, setting ngpus=4" @@ -40,7 +56,7 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then ;; esac - export JSC_BATCH_CONFIG="$res --partition ${partition:-booster} --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" + export JSC_BATCH_CONFIG="$res --partition ${partition} --cpu-bind=sockets --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" export JSC_ALLOC_CMD="salloc $JSC_BATCH_CONFIG" export JSC_SUBMIT_CMD="srun $JSC_BATCH_CONFIG --pty" @@ -50,6 +66,12 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then export PS1="\[\033[0;34m\]Ⓒ\[\033[0m\] $PS1" + if [[ -z "${_JSCCOURSE_OVERRIDE_LOCALE}" ]]; then + export LC_ALL=en_US.UTF-8 + export LANG=en_US.UTF-8 + export LANGUAGE=en_US.UTF-8 + fi + # export UCX_WARN_UNUSED_ENV_VARS=n # User specific aliases and functions @@ -86,7 +108,7 @@ if [[ $- =~ "i" ]]; then echo "" echo "*******************************************************************************" - echo " Welcome to the SC21 Tutorial on Multi-GPU Computing for Exascale! " + echo " Welcome to the ISC22 Tutorial on Multi-GPU Computing for Exascale! " # echo " A default call to get a batch system allocation is stored in \$JSC_ALLOC_CMD!" # echo " Use it with \`eval \$JSC_ALLOC_CMD\`. 
The value of \$JSC_ALLOC_CMD is:" # echo -n " " diff --git a/.etc/modules.sh b/.etc/modules.sh index 4e0cc01..b945085 100644 --- a/.etc/modules.sh +++ b/.etc/modules.sh @@ -1,10 +1,10 @@ module use $OTHERSTAGES module purge -module load Stages/2020 -module load GCC/10.3.0 -module load CUDA/11.3 -module load ParaStationMPI/5.4.10-1 -module load NVSHMEM/2.2.1 -module load NCCL/2.10.3-1-CUDA-11.3 -module load Nsight-Systems/2021.4.1 +module load Stages/2022 +module load GCC/11.2.0 +module load CUDA/11.5 +module load ParaStationMPI/5.5.0-1 +module load NVSHMEM/2.5.0 +module load NCCL/2.12.7-1-CUDA-11.5 +module load Nsight-Systems/2022.2.1 # module use $JSCCOURSE_DIR_GROUP/common/modulefiles \ No newline at end of file diff --git a/.gitignore b/.gitignore index f40d759..f3057c8 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -tut138-multi-gpu.tar.gz +tut147s1-multi-gpu.tar.gz diff --git a/.zenodo.json b/.zenodo.json index fb8d225..e012d96 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -29,19 +29,19 @@ "title": "Efficient Distributed GPU Programming for Exascale", - "publication_date": "2021-11-14", + "publication_date": "2022-05-29", - "description": "
Over the past years, GPUs became ubiquitous in HPC installations around the world. Today, they provide the majority of performance of some of the largest supercomputers (e.g. Summit, Sierra, JUWELS Booster). This trend continues in upcoming pre-exascale and exascale systems (LUMI, Leonardo; Frontier): GPUs are chosen as the core computing devices to enter this next era of HPC.
To take advantage of future GPU-accelerated systems with tens of thousands of devices, application developers need to have the proper skills and tools to understand, manage, and optimize distributed GPU applications.
In this tutorial, participants will learn techniques to efficiently program large-scale multi-GPU systems. While programming multiple GPUs with MPI is explained in detail, also advanced techniques and models (NCCL, NVSHMEM, …) are presented. Tools for analysis are used to motivate implementation of performance optimizations. The tutorial combines lectures and hands-on exercises, using Europe’s fastest supercomputer, JUWELS Booster with NVIDIA A100 GPUs.
", + "description": "Over the past years, GPUs became ubiquitous in HPC installations around the world. Today, they provide the majority of performance of some of the largest supercomputers (e.g. Summit, Sierra, JUWELS Booster). This trend continues in the pre-exascale and exascale systems (LUMI, Leonardo; Perlmutter, Frontier): GPUs are chosen as the core computing devices to enter this next era of HPC.
To take advantage of future GPU-accelerated systems with tens of thousands of devices, application developers need to have the proper skills and tools to understand, manage, and optimize distributed GPU applications. In this tutorial, participants will learn techniques to efficiently program large-scale multi-GPU systems. While programming multiple GPUs with MPI is explained in detail, advanced tuning techniques and complementary programming models like NCCL and NVSHMEM are presented as well. Tools for analysis are shown and used to motivate and implement performance optimizations. The tutorial is a combination of lectures and hands-on exercises, using Europe’s fastest supercomputer, JUWELS Booster with NVIDIA GPUs, for interactive learning and discovery.
", - "notes": "Slides and exercises of tutorial presented virtually at SC21 (International Conference for High Performance Computing, Networking, Storage, and Analysis); https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188", + "notes": "Slides and exercises of tutorial presented virtually at ISC22 (ISC High Performance 2022); https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2", "access_right": "open", - "conference_title": "Supercomputing Conference 2021", - "conference_acronym": "SC21", - "conference_dates": "14-19 November 2021", - "conference_place": "St. Louis, MO, USA and virtual", - "conference_url": "https://sc21.supercomputing.org/", + "conference_title": "ISC HPC 2022", + "conference_acronym": "ISC22", + "conference_dates": "29 May-02 June 2022", + "conference_place": "Hamburg, Germany", + "conference_url": "https://www.isc-hpc.com/", "conference_session": "Tutorials", "conference_session_part": "Day 1", diff --git a/01-L_Introduction_Overview/slides.pdf b/01-L_Introduction_Overview/slides.pdf index 402b219..5a5302a 100644 Binary files a/01-L_Introduction_Overview/slides.pdf and b/01-L_Introduction_Overview/slides.pdf differ diff --git a/01b-H_Onboarding/slides.pdf b/01b-H_Onboarding/slides.pdf index a400409..299d103 100644 Binary files a/01b-H_Onboarding/slides.pdf and b/01b-H_Onboarding/slides.pdf differ diff --git a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf index f086a95..f28aedb 100644 Binary files a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf and b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf differ diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb index fb5b4ee..0bfdfab 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -70,7 +70,8 @@ "- Compute how many processes get (ny - 2) / size resp (ny - 2) /\n", " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" - ] + ], + "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.md b/03-H_Multi_GPU_Parallelization/.master/Instructions.md index 2600a58..f9794b2 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- 
Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb index fb5b4ee..0bfdfab 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -70,7 +70,8 @@ "- Compute how many processes get (ny - 2) / size resp (ny - 2) /\n", " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" - ] + ], + "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md index 2600a58..f9794b2 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb index fb5b4ee..0bfdfab 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -70,7 +70,8 @@ "- Compute how many processes get (ny - 2) / size resp (ny - 2) /\n", " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" - ] + 
], + "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md index 2600a58..f9794b2 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb index fb5b4ee..0bfdfab 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -70,7 +70,8 @@ "- Compute how many processes get (ny - 2) / size resp (ny - 2) /\n", " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" - ] + ], + "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md index 2600a58..f9794b2 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/04-L_Performance_and_debugging_tools/slides.pdf b/04-L_Performance_and_debugging_tools/slides.pdf index 35a7272..42a8a49 100644 Binary files a/04-L_Performance_and_debugging_tools/slides.pdf and b/04-L_Performance_and_debugging_tools/slides.pdf differ diff --git a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf index 682c130..b76953f 100644 Binary files 
a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf and b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf differ diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb index 16ddf37..3c7fa05 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -84,7 +84,8 @@ " boundary conditions using MPI\n", "- Destroy the additional cuda streams and events before ending the\n", " application" - ] + ], + "id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md index 4a21a66..f971296 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 6: Overlap Communication and Computation with MPI diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb index 16ddf37..3c7fa05 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -84,7 +84,8 @@ " boundary conditions using MPI\n", "- Destroy the additional cuda streams and events before ending the\n", " application" - ] + ], + 
"id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb index 16ddf37..3c7fa05 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -84,7 +84,8 @@ " boundary conditions using MPI\n", "- Destroy the additional cuda streams and events before ending the\n", " application" - ] + ], + "id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" } ], "nbformat": 4, diff --git a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf index 373bb1b..2e5ca68 100644 Binary files a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf and b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf differ diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb index a9908cd..cd4d85a 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -47,7 +47,8 @@ "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." 
- ] + ], + "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md index c99a7c4..465181f 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp index cae5722..33760e2 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp @@ -293,13 +293,13 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclGroupEnd()); CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); #else - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, + MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - std::swap(a_new, a); + std::swap(a_new, a); } POP_RANGE @@ -326,7 +326,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -346,7 +346,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. 
// Remember to use ncclGroupStart() and ncclGroupEnd() #ifdef SOLUTION @@ -358,14 +358,14 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); #else - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -410,13 +410,13 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap #ifdef SOLUTION printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb index b6476ef..eb6077a 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -49,21 +49,22 @@ "\n", "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", - "variable `NP` can be set to change the number of processes. \n", + "variable `NP` can be set to change the number of processes.\n", "\n", "#### Note\n", "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. 
This is automatically done for the `sanitize`, `run` and\n", - "`profile` make targets.\n", - "\n", - ] + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "`profile` make targets." + ], + "id": "4153946b-60de-494a-ad07-7ecb34a91c84" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md index 0100bbf..a263c17 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile index 9e301fd..e711f37 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile @@ -1,6 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. NP ?= 4 NVCC=nvcc +N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda @@ -25,7 +26,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu index cdc276e..e755c7c 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu @@ -317,19 +317,6 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); - PUSH_RANGE("MPI_Warmup", 5) - for (int i = 0; i < 10; ++i) { - const int top = rank > 0 ? 
rank - 1 : (size - 1); - const int bottom = (rank + 1) % size; - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, - a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE)); - MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - std::swap(a_new, a); - } - POP_RANGE - CUDA_RT_CALL(cudaDeviceSynchronize()); if (!csv && 0 == rank) { @@ -354,25 +341,25 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); - launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); - - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); if (calculate_norm) { - CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions #ifdef SOLUTION - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); #else PUSH_RANGE("MPI", 5) @@ -382,12 +369,12 @@ int main(int argc, char* argv[]) { MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) 
#ifdef SOLUTION nvshmemx_barrier_all_on_stream(compute_stream); #endif @@ -434,7 +421,7 @@ int main(int argc, char* argv[]) { if (csv) { //TODO: Replace MPI with NVSHMEM for your output #ifdef SOLUTION - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb index a9908cd..cd4d85a 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -47,7 +47,8 @@ "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." - ] + ], + "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md index c99a7c4..465181f 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp index 202d76e..0b0b018 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp @@ -281,7 +281,7 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); NCCL_CALL(ncclGroupEnd()); CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); - std::swap(a_new, a); + std::swap(a_new, a); } POP_RANGE @@ -308,7 +308,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, 
calculate_norm, @@ -328,7 +328,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() PUSH_RANGE("NCCL_LAUNCH", 5) @@ -338,7 +338,7 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -383,9 +383,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb index b6476ef..eb6077a 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -49,21 +49,22 @@ "\n", "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", - "variable `NP` can be set to change the number of processes. \n", + "variable `NP` can be set to change the number of processes.\n", "\n", "#### Note\n", "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. 
This is automatically done for the `sanitize`, `run` and\n", - "`profile` make targets.\n", - "\n", - ] + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "`profile` make targets." + ], + "id": "4153946b-60de-494a-ad07-7ecb34a91c84" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md index 0100bbf..a263c17 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile index 9e301fd..e711f37 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile @@ -1,6 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. NP ?= 4 NVCC=nvcc +N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda @@ -25,7 +26,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu index 6fc3906..d293c62 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu @@ -304,19 +304,6 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); - PUSH_RANGE("MPI_Warmup", 5) - for (int i = 0; i < 10; ++i) { - const int top = rank > 0 ? 
rank - 1 : (size - 1); - const int bottom = (rank + 1) % size; - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, - a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE)); - MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - std::swap(a_new, a); - } - POP_RANGE - CUDA_RT_CALL(cudaDeviceSynchronize()); if (!csv && 0 == rank) { @@ -341,31 +328,31 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); - launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); - - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); if (calculate_norm) { - CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) 
nvshmemx_barrier_all_on_stream(compute_stream); if (calculate_norm) { @@ -409,7 +396,7 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { //TODO: Replace MPI with NVSHMEM for your output - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb index a9908cd..cd4d85a 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -47,7 +47,8 @@ "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." - ] + ], + "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md index c99a7c4..465181f 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp index 157b320..3588d21 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp @@ -260,12 +260,12 @@ int main(int argc, char* argv[]) { // on the compute_stream. 
// Remeber that a group of ncclRecv and ncclSend should be within a ncclGroupStart() and ncclGroupEnd() // Also, Rember to stream synchronize on the compute_stream at the end - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, + MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - std::swap(a_new, a); + MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + std::swap(a_new, a); } POP_RANGE @@ -292,7 +292,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -312,16 +312,16 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -366,9 +366,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb index b6476ef..eb6077a 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " 
https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -49,21 +49,22 @@ "\n", "Study the performance by glimpsing at the profile generated with\n", "`make profile`. For `make run` and `make profile` the environment\n", - "variable `NP` can be set to change the number of processes. \n", + "variable `NP` can be set to change the number of processes.\n", "\n", "#### Note\n", "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. This is automatically done for the `sanitize`, `run` and\n", - "`profile` make targets.\n", - "\n", - ] + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "`profile` make targets." + ], + "id": "4153946b-60de-494a-ad07-7ecb34a91c84" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md index 0100bbf..a263c17 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile index 9e301fd..e711f37 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile @@ -1,6 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
NP ?= 4 NVCC=nvcc +N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda @@ -25,7 +26,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu index 37ed848..0359d7e 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu @@ -295,19 +295,6 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); - PUSH_RANGE("MPI_Warmup", 5) - for (int i = 0; i < 10; ++i) { - const int top = rank > 0 ? rank - 1 : (size - 1); - const int bottom = (rank + 1) % size; - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, - a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE)); - MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - std::swap(a_new, a); - } - POP_RANGE - CUDA_RT_CALL(cudaDeviceSynchronize()); if (!csv && 0 == rank) { @@ -332,21 +319,21 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); - launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); - - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); if (calculate_norm) { - CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, @@ -354,12 +341,12 @@ int main(int argc, char* argv[]) { MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - 
CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) if (calculate_norm) { CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); diff --git a/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf b/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf index 05fb991..c12ae33 100644 Binary files a/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf and b/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf differ diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb index 1f2b3b1..00b04f9 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", "\n", @@ -55,12 +55,13 @@ "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. This is automatically done for the `sanitize`, `run` and\n", + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -84,9 +85,12 @@ " variant in the [Multi GPU Programming Models Github\n", " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." 
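For the Hands-On 8-NVSHMEM TODOs above (replacing the `MPI_Sendrecv` halo exchange in `jacobi.cu` with host-initiated NVSHMEM calls and adding the inter-PE synchronization), a minimal sketch of the marked region could look as follows. It is an illustration, not the reference solution: it assumes `a` and `a_new` were allocated with `nvshmem_malloc` (so their addresses are symmetric across PEs), that all PEs use identical `iy_start`/`iy_end` offsets, and that `real` is `float`; it reuses the streams, events, and helper macros already present in the file.

```
// Put my first interior row into the bottom halo row of the top neighbour,
// and my last interior row into the top halo row (row 0) of the bottom neighbour.
nvshmemx_float_put_on_stream(a_new + iy_end * nx, a_new + iy_start * nx, nx, top, push_stream);
nvshmemx_float_put_on_stream(a_new, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
// After compute_stream has waited on push_done, make the halo updates visible on all PEs:
nvshmemx_barrier_all_on_stream(compute_stream);
```

The `*_on_stream` puts keep the communication ordered with the boundary kernels on `push_stream`, and the stream barrier both completes the puts and synchronizes the PEs before the next iteration starts.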
- ] + ], + "id": "f7525123-132c-4d36-890e-9efe369db7be" } ], + "nbformat": 4, + "nbformat_minor": 5, "metadata": { "kernelspec": { "display_name": "Python 3", @@ -105,7 +109,5 @@ "pygments_lexer": "ipython3", "version": "3.6.8" } - }, - "nbformat": 4, - "nbformat_minor": 5 + } } diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md index 2c5898d..bd8df5f 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 10: Device-initiated Communication with NVSHMEM diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile index b3da6d1..1d83127 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile @@ -1,7 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. NP ?= 4 NVCC=nvcc -JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda ifndef NVSHMEM_HOME @@ -25,7 +25,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu index 5bc4059..9829940 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu @@ -307,6 +307,9 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + //TODO: Remove unnecessary MPI communication +#ifdef SOLUTION +#else PUSH_RANGE("MPI_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? 
rank - 1 : (size - 1); @@ -319,6 +322,7 @@ int main(int argc, char* argv[]) { std::swap(a_new, a); } POP_RANGE +#endif CUDA_RT_CALL(cudaDeviceSynchronize()); diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb index 1f2b3b1..00b04f9 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", "\n", @@ -55,12 +55,13 @@ "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. This is automatically done for the `sanitize`, `run` and\n", + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -84,9 +85,12 @@ " variant in the [Multi GPU Programming Models Github\n", " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." 
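For the advanced task described above, a simplified sketch of the block-scoped, non-blocking put is shown below. It only illustrates the API and is not the tutorial's reference solution: the task suggests fusing the put into the Jacobi boundary kernel, whereas this standalone kernel, the PE arguments `top_pe`/`bottom_pe`, and the per-block chunking are assumptions for illustration, and `real` is again assumed to be `float` with `a_new` on the symmetric heap.

```
// Each thread block cooperatively pushes one contiguous chunk of the two boundary rows.
// All threads of a block must reach the block-scoped call together.
__global__ void boundary_push_kernel(float* a_new, int nx, int iy_start, int iy_end,
                                     int top_pe, int bottom_pe) {
    const int chunk = (nx + gridDim.x - 1) / gridDim.x;
    const int ix0 = blockIdx.x * chunk;
    const int len = min(chunk, nx - ix0);
    if (len <= 0) return;
    // first interior row -> bottom halo row of the top neighbour (non-blocking)
    nvshmemx_float_put_nbi_block(a_new + iy_end * nx + ix0, a_new + iy_start * nx + ix0, len, top_pe);
    // last interior row -> top halo row (row 0) of the bottom neighbour
    nvshmemx_float_put_nbi_block(a_new + ix0, a_new + (iy_end - 1) * nx + ix0, len, bottom_pe);
    // completion and inter-PE ordering are enforced afterwards on the host,
    // e.g. with nvshmemx_barrier_all_on_stream
}
```

Letting a whole block issue one contiguous put amortizes per-message overhead compared to per-element `nvshmem_float_p` calls, which is the motivation for the block-scoped variant.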
- ] + ], + "id": "f7525123-132c-4d36-890e-9efe369db7be" } ], + "nbformat": 4, + "nbformat_minor": 5, "metadata": { "kernelspec": { "display_name": "Python 3", @@ -105,7 +109,5 @@ "pygments_lexer": "ipython3", "version": "3.6.8" } - }, - "nbformat": 4, - "nbformat_minor": 5 + } } diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md index 2c5898d..bd8df5f 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 10: Device-initiated Communication with NVSHMEM diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile index b3da6d1..1d83127 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile @@ -1,7 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. NP ?= 4 NVCC=nvcc -JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda ifndef NVSHMEM_HOME @@ -25,7 +25,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu index aa2539c..04c3b63 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu @@ -288,18 +288,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); - PUSH_RANGE("MPI_Warmup", 5) - for (int i = 0; i < 10; ++i) { - const int top = rank > 0 ? 
rank - 1 : (size - 1); - const int bottom = (rank + 1) % size; - MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, - a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE)); - MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, - MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - std::swap(a_new, a); - } - POP_RANGE + //TODO: Remove unnecessary MPI communication CUDA_RT_CALL(cudaDeviceSynchronize()); diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb index 1f2b3b1..00b04f9 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 14 November 2021 8AM - 5PM CST\n", - "- Location: *online*\n", + "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", + "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", "- Program Link:\n", - " https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188\n", + " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", "\n", "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", "\n", @@ -55,12 +55,13 @@ "\n", "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", + "use (see [GPU\n", + "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", "in the JUWELS Booster Overview documentation). This is not supported for\n", "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun.\n", - "With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", - "listed. This is automatically done for the `sanitize`, `run` and\n", + "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", + "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", + "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -84,9 +85,12 @@ " variant in the [Multi GPU Programming Models Github\n", " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." 
- ] + ], + "id": "f7525123-132c-4d36-890e-9efe369db7be" } ], + "nbformat": 4, + "nbformat_minor": 5, "metadata": { "kernelspec": { "display_name": "Python 3", @@ -105,7 +109,5 @@ "pygments_lexer": "ipython3", "version": "3.6.8" } - }, - "nbformat": 4, - "nbformat_minor": 5 + } } diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md index 2c5898d..bd8df5f 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md @@ -1,9 +1,9 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale - -- Time: Sunday, 14 November 2021 8AM - 5PM CST -- Location: *online* -- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188 +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +- Time: Sunday, 29 May 2022 9AM - 6PM CEST +- Location: Hall Y6, Congress Center Hamburg (CCH) +- Program Link: + https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 ## Hands-On 10: Device-initiated Communication with NVSHMEM diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile index b3da6d1..1d83127 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile @@ -1,7 +1,7 @@ # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. NP ?= 4 NVCC=nvcc -JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 C_V_D ?= 0,1,2,3 CUDA_HOME ?= /usr/local/cuda ifndef NVSHMEM_HOME @@ -25,7 +25,7 @@ else NVCC_FLAGS = -DHAVE_CUB endif NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt +NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt -lnvidia-ml jacobi: Makefile jacobi.cu $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu index 2a71d15..a242329 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu +++ b/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu @@ -271,6 +271,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + //TODO: Remove unnecessary MPI communication PUSH_RANGE("MPI_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? 
rank - 1 : (size - 1); diff --git a/11-L_Summary_Advanced/slides.pdf b/11-L_Summary_Advanced/slides.pdf index 0de6ef3..f3e27c1 100644 Binary files a/11-L_Summary_Advanced/slides.pdf and b/11-L_Summary_Advanced/slides.pdf differ diff --git a/README.md b/README.md index 0b62405..f64dc6a 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,14 @@ -# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale +# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale -Repository with talks and exercises of our [Efficient Distributed GPU Programming for Exascale](https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188). The tutorial was held virtually, partly recorded, partly live, using JUWELS Booster at JSC. +[](https://doi.org/10.5281/zenodo.5745505) (*old*) + +Repository with talks and exercises of our [Efficient Distributed GPU Programming for Exascale](https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2) tutorial. ## Coordinates -* Date: 14 November 2021 -* Occasion: SC21 Tutorial -* Tutors: Simon Garcia (BSC), Andreas Herten (JSC), Markus Hrywniak (NVIDIA), Jiri Kraus (NVIDIA), Lena Oden (Uni Hagen) +* Date: 29 May 2022 +* Occasion: ISC22 Tutorial +* Tutors: Andreas Herten (JSC), Markus Hrywniak (NVIDIA), Jiri Kraus (NVIDIA), Lena Oden (Uni Hagen) (and Simon Garcia (BSC), helping from afar) ## Setup @@ -33,16 +35,16 @@ The supercomputer used for the exercises is [JUWELS Booster](https://apps.fz-jue Visual onboarding instructions can be found in the subfolder of the corresponding lecture, `01b-H-Onboarding/`. Here follows the textual description: * Register for an account at [JuDoor](https://judoor.fz-juelich.de/login) -* Sign-up for the [`training2125` project](https://judoor.fz-juelich.de/projects/join/training2125) +* Sign up for the [`training2216` project](https://judoor.fz-juelich.de/projects/join/training2216) * Accept the Usage Agreement of JUWELS * Wait for wheels to turn as your information is pushed through the systems (about 15 minutes) * Access JUWELS Booster via [JSC's Jupyter portal](https://jupyter-jsc.fz-juelich.de/) -* Create a Jupyter v2 instance using `LoginNodeBooster` and the `training2125` allocation on JUWELS +* Create a Jupyter v2 instance using `LoginNodeBooster` and the `training2216` allocation on JUWELS * When started, launch a browser-based Shell in Jupyter * Source the course environment to introduce its commands and helper scripts into your environment ``` - source $PROJECT_training2125/env.sh + source $PROJECT_training2216/env.sh ``` * Sync course material to your home directory with `jsc-material-sync`. -You can also access JSC's facilities via SSH. In that case you need to add your SSH key through JuDoor. You need to restrict access from certain IPs/IP ranges via the `from` clause, as explained [in the documentation](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#ssh-login). We recommend using Jupyter JSC for its simplicity, especially during such a short day that is the tutorial day. +You can also access JSC's facilities via SSH. In that case you need to add your SSH key through JuDoor. You need to restrict access from certain IPs/IP ranges via the `from` clause, as explained [in the documentation](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#ssh-login). We recommend using Jupyter JSC for its simplicity, especially on a short day like the tutorial day. \ No newline at end of file