diff --git a/.archive.mk b/.archive.mk index 4887b87..27ce859 100755 --- a/.archive.mk +++ b/.archive.mk @@ -6,16 +6,16 @@ # Changelog: # * Nov 2022: The archive is extracted again, then slides.pdf is removed if a patched slides-sc22.pdf is found (which includes an SC22 slide 0 title slide); and then repackaged .PHONY: all -all: tut102-multi-gpu.tar.gz +all: tut140-multi-gpu.tar.gz -SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut102-multi-gpu.tar.gz" -printf '%P\n' | sort -h) +SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut140-multi-gpu.tar.gz" -printf '%P\n' | sort -h) -tut102-multi-gpu.tar.gz: $(shell find . -not -name "tut102-multi-gpu.tar.gz") +tut140-multi-gpu.tar.gz: $(shell find . -not -name "tut140-multi-gpu.tar.gz") sed -i '1 i***Please check GitHub repo for latest version of slides: https://github.com/FZJ-JSC/tutorial-multi-gpu/ ***\n' README.md - tar czf $@ --transform 's,^,SC22-tut102-Multi-GPU/,' --exclude=".*" $(SOURCES) + tar czf $@ --transform 's,^,SC23-tut140-Multi-GPU/,' --exclude=".*" $(SOURCES) tar xf $@ rm $@ - find SC22-tut102-Multi-GPU/ -not -path './.*' -iname 'slides-*.pdf' -execdir rm slides.pdf \; - tar czf $@ SC22-tut102-Multi-GPU - rm -rf SC22-tut102-Multi-GPU + find SC23-tut140-Multi-GPU/ -not -path './.*' -iname 'slides-*.pdf' -execdir rm slides.pdf \; + tar czf $@ SC23-tut140-Multi-GPU + rm -rf SC23-tut140-Multi-GPU sed -i '1,2d' README.md \ No newline at end of file diff --git a/.etc/.set-facl-permissions.sh b/.etc/.set-facl-permissions.sh index 33b48bc..b316957 100644 --- a/.etc/.set-facl-permissions.sh +++ b/.etc/.set-facl-permissions.sh @@ -3,8 +3,8 @@ set -x for user in haghighimood1 kraus1 hrywniak1 oden1 garciadegonzalo1 badwaik1 john2; do - setfacl -m u:$user:rwx -R $PROJECT_training2313/common/ - setfacl -m u:$user:rwx -R $PROJECT_training2313/env.sh + setfacl -m u:$user:rwx -R $PROJECT_training2332/common/ + setfacl -m u:$user:rwx -R $PROJECT_training2332/env.sh done set +x \ No newline at end of file diff --git a/.etc/deploy-material.sh b/.etc/deploy-material.sh index 81cf58d..5c3ef1e 100755 --- a/.etc/deploy-material.sh +++ b/.etc/deploy-material.sh @@ -1 +1 @@ -rsync --archive --exclude="*minified.pdf" --exclude="tut*" --exclude=".*" --verbose ../ judac:/p/project/training2313/common/material/ +rsync --archive --exclude="*minified.pdf" --exclude="tut*" --exclude=".*" --exclude="*-sc*.pdf" --verbose ../ judac:/p/project/training2332/common/material/ diff --git a/.etc/deploy.sh b/.etc/deploy.sh index 363675f..49dcb68 100755 --- a/.etc/deploy.sh +++ b/.etc/deploy.sh @@ -1 +1 @@ -rsync --archive --exclude="deploy.sh" --verbose . judac:/p/project/training2313/common/environment/ +rsync --archive --exclude="deploy.sh" --exclude="raw/" --exclude="sc23-titleslides/" --verbose . 
judac:/p/project/training2332/common/environment/ diff --git a/.etc/instructions-header.md b/.etc/instructions-header.md index 4267766..b73da92 100644 --- a/.etc/instructions-header.md +++ b/.etc/instructions-header.md @@ -1,7 +1,6 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 diff --git a/.etc/jsccourse-bashrc.sh b/.etc/jsccourse-bashrc.sh index 57ae415..be3571c 100644 --- a/.etc/jsccourse-bashrc.sh +++ b/.etc/jsccourse-bashrc.sh @@ -11,10 +11,10 @@ # Andreas Herten, >2017 ################################################ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then - project="training2313" + project="training2332" export JSCCOURSE_DIR_GROUP=/p/project/$project - export JSCCOURSE_DIR_LOCAL=${JSCCOURSE_DIR_LOCAL_BASE:-$HOME}/ISC23-Multi-GPU-Tutorial + export JSCCOURSE_DIR_LOCAL=${JSCCOURSE_DIR_LOCAL_BASE:-$HOME}/SC23-Multi-GPU-Tutorial export _JSCCOURSE_ENV_SOURCED="$(date)" export C_V_D="0,1,2,3" @@ -23,8 +23,8 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then res="" currentday=$(date +%d) - if [[ "$currentday" == "21" ]]; then - res="--reservation isc23-tutorial-2023-05-21" + if [[ "$currentday" == "13" ]]; then + res="--reservation sc-tutorial" fi export SLURM_NTASKS=1 @@ -56,9 +56,10 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then ;; esac - export JSC_BATCH_CONFIG="$res --partition ${partition} --cpu-bind=sockets --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" + export JSC_BATCH_CONFIG="$res --partition ${partition} --disable-dcgm --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" export JSC_ALLOC_CMD="salloc $JSC_BATCH_CONFIG" - export JSC_SUBMIT_CMD="srun $JSC_BATCH_CONFIG --pty" + # export JSC_SUBMIT_CMD="srun $JSC_BATCH_CONFIG --pty" + export JSC_SUBMIT_CMD="salloc $JSC_BATCH_CONFIG srun --cpu-bind=sockets --pty" export _JSC_MATERIAL_SYNC="rsync --archive --update --exclude='.*' --exclude='.*/' $JSCCOURSE_DIR_GROUP/common/material/ $JSCCOURSE_DIR_LOCAL" export _JSC_MATERIAL_SYNC_FORCE="rsync --archive --exclude='.*' --exclude='.*/' $JSCCOURSE_DIR_GROUP/common/material/ $JSCCOURSE_DIR_LOCAL" @@ -108,7 +109,7 @@ if [[ $- =~ "i" ]]; then echo "" echo "*******************************************************************************" - echo " Welcome to the ISC23 Tutorial on Multi-GPU Computing for Exascale! " + echo " Welcome to the SC23 Tutorial on Multi-GPU Computing for Exascale! " # echo " A default call to get a batch system allocation is stored in \$JSC_ALLOC_CMD!" # echo " Use it with \`eval \$JSC_ALLOC_CMD\`. 
The value of \$JSC_ALLOC_CMD is:" # echo -n " " diff --git a/.etc/modules.sh b/.etc/modules.sh index d17a67a..12f3e35 100644 --- a/.etc/modules.sh +++ b/.etc/modules.sh @@ -2,9 +2,9 @@ module purge module load Stages/2023 module load GCC/11.3.0 module load CUDA/11.7 -module load ParaStationMPI/5.7.0-1 +module load ParaStationMPI/5.7.1-1 module load NCCL/default-CUDA-11.7 -module load NVSHMEM/2.5.0 +module load NVSHMEM/2.8.0-CUDA-11.7 module load Nsight-Systems/2023.2.1 module load MPI-settings/CUDA # module use $JSCCOURSE_DIR_GROUP/common/modulefiles \ No newline at end of file diff --git a/.etc/printout.tex b/.etc/printout.tex index f4eb68a..5c5d62c 100644 --- a/.etc/printout.tex +++ b/.etc/printout.tex @@ -52,12 +52,12 @@ \vspace*{1\baselineskip} \begin{enumerate} - \item Create \highlight{\emph{JuDoor}} account, \highlight{join} \texttt{training2313} project: \href{https://go.fzj.de/mg-jd}{\texttt{go.fzj.de/mg-jd}} + \item Create \highlight{\emph{JuDoor}} account, \highlight{join} \texttt{training2332} project: \href{https://go.fzj.de/mg-jd}{\texttt{go.fzj.de/mg-jd}} \item Fill \highlight{usage agreement}; wait at least 15 min for synchronization \item Login to \highlight{Jupyter}: \href{https://jupyter-jsc.fz-juelich.de}{\texttt{jupyter-jsc.fz-juelich.de}} (JuDoor credentials) \item Create new Jupyter \highlight{instance} on \texttt{LoginNode\textbf{Booster}} with training project \item Start Jupyter Terminal, \highlight{source} course environment\\ - \verb|source $PROJECT_training2313/env.sh| + \verb|source $PROJECT_training2332/env.sh| \item \highlight{Sync} course material: \verb|jsc-material-sync| \end{enumerate} \end{document} \ No newline at end of file diff --git a/.etc/sc22-titleslides/prelude_slides.mk b/.etc/sc22-titleslides/prelude_slides.mk deleted file mode 100755 index 4554893..0000000 --- a/.etc/sc22-titleslides/prelude_slides.mk +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/make -f -# LAUNCH FROM THIS FOLDER - -OUTPUT=../../01-L_Introduction_Overview/slides-sc22.pdf ../../01b-H_Onboarding/slides-sc22.pdf ../../04-L_Performance_and_debugging_tools/slides-sc22.pdf ../../11-L_Summary_Advanced/slides-sc22.pdf -# OUTPUT=../../01-L_Introduction_Overview/slides-sc22.pdf ../../01b-H_Onboarding/slides-sc22.pdf ../../04-L_Performance_and_debugging_tools/slides-sc22.pdf ../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc22.pdf ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc22.pdf ../../11-L_Summary_Advanced/slides-sc22.pdf - -.PHONY: all -all: $(OUTPUT) -MYTMPDIR:=$(shell mktemp -d) - -title-slide.in.tex: title-slide.tex - cat $< | \ - sed 's#INSERT TITLE HERE#((( title )))#' | \ - sed 's#Insert Author Here#((( author )))#' > \ - $@ - -title-slide.01.tex ../../01-L_Introduction_Overview/slides-sc22.pdf: SESSIONKEY=01 -title-slide.01b.tex ../../01b-H_Onboarding/slides-sc22.pdf: SESSIONKEY=01b -title-slide.04.tex ../../04-L_Performance_and_debugging_tools/slides-sc22.pdf: SESSIONKEY=04 -title-slide.05.tex ../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc22.pdf: SESSIONKEY=05 -title-slide.09.tex ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc22.pdf: SESSIONKEY=09 -title-slide.11.tex ../../11-L_Summary_Advanced/slides-sc22.pdf: SESSIONKEY=11 -title-slide.01.tex title-slide.01b.tex title-slide.04.tex title-slide.05.tex title-slide.09.tex title-slide.11.tex: title-slide.tex - python3 gen-titleslide.py --author "$(shell cat sessions.yml | yq .$(SESSIONKEY).author)" --title "$(shell 
cat sessions.yml | yq .$(SESSIONKEY).title)" --out "$@" - -../../01-L_Introduction_Overview/slides-sc22.pdf: BASEDECK=../../01-L_Introduction_Overview/slides.pdf -../../01b-H_Onboarding/slides-sc22.pdf: BASEDECK=../../01b-H_Onboarding/slides.pdf -../../04-L_Performance_and_debugging_tools/slides-sc22.pdf: BASEDECK=../../04-L_Performance_and_debugging_tools/slides.pdf -../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc22.pdf: BASEDECK=../../05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf -../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc22.pdf: BASEDECK=../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf -../../11-L_Summary_Advanced/slides-sc22.pdf: BASEDECK=../../11-L_Summary_Advanced/slides.pdf - -.SECONDEXPANSION: -%-sc22.pdf: %.pdf title-slide.$$(SESSIONKEY).tex $(BASEDECK) - latexmk -output-directory=$(MYTMPDIR) -jobname=${SESSIONKEY} -pdfxe title-slide.$(SESSIONKEY).tex - cp $(MYTMPDIR)/${SESSIONKEY}.pdf title-slide.$(SESSIONKEY).pdf - mutool merge -o $@ title-slide.$(SESSIONKEY).pdf 0 $(BASEDECK) \ No newline at end of file diff --git a/.etc/sc22-titleslides/sc-background.png b/.etc/sc22-titleslides/sc-background.png deleted file mode 100644 index 5d331f3..0000000 Binary files a/.etc/sc22-titleslides/sc-background.png and /dev/null differ diff --git a/.etc/sc22-titleslides/.gitignore b/.etc/sc23-titleslides/.gitignore similarity index 100% rename from .etc/sc22-titleslides/.gitignore rename to .etc/sc23-titleslides/.gitignore diff --git a/.etc/sc22-titleslides/README.md b/.etc/sc23-titleslides/README.md similarity index 100% rename from .etc/sc22-titleslides/README.md rename to .etc/sc23-titleslides/README.md diff --git a/.etc/sc22-titleslides/gen-titleslide.py b/.etc/sc23-titleslides/gen-titleslide.py similarity index 100% rename from .etc/sc22-titleslides/gen-titleslide.py rename to .etc/sc23-titleslides/gen-titleslide.py diff --git a/.etc/sc23-titleslides/prelude_slides.mk b/.etc/sc23-titleslides/prelude_slides.mk new file mode 100755 index 0000000..2d0d3a6 --- /dev/null +++ b/.etc/sc23-titleslides/prelude_slides.mk @@ -0,0 +1,44 @@ +#!/usr/bin/make -f +# LAUNCH FROM THIS FOLDER + +# OUTPUT=../../01-L_Introduction_Overview/slides-sc23.pdf ../../01b-H_Onboarding/slides-sc23.pdf ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf ../../11-L_Summary_Advanced/slides-sc23.pdf +OUTPUT=../../01-L_Introduction_Overview/slides-sc23.pdf ../../01b-H_Onboarding/slides-sc23.pdf ../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf ../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf ../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf ../../11-L_Summary_Advanced/slides-sc23.pdf + +.PHONY: all +all: $(OUTPUT) +MYTMPDIR:=$(shell mktemp -d) + +title-slide.in.tex: title-slide.tex + cat $< | \ + sed 's#INSERT TITLE HERE#((( title )))#' | \ + sed 's#Insert Author Here#((( author )))#' > \ + $@ + +title-slide.01.tex ../../01-L_Introduction_Overview/slides-sc23.pdf: SESSIONKEY=01 +title-slide.01b.tex ../../01b-H_Onboarding/slides-sc23.pdf: SESSIONKEY=01b +title-slide.02.tex ../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf: SESSIONKEY=02 +title-slide.04.tex ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf: SESSIONKEY=04 +title-slide.05.tex 
../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf: SESSIONKEY=05 +title-slide.07.tex ../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf: SESSIONKEY=07 +title-slide.09.tex ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf: SESSIONKEY=09 +title-slide.11.tex ../../11-L_Summary_Advanced/slides-sc23.pdf: SESSIONKEY=11 +title-slide.01.tex title-slide.01b.tex title-slide.02.tex title-slide.04.tex title-slide.05.tex title-slide.07.tex title-slide.09.tex title-slide.11.tex: title-slide.tex + python3 gen-titleslide.py --author "$(shell cat sessions.yml | yq .$(SESSIONKEY).author)" --title "$(shell cat sessions.yml | yq .$(SESSIONKEY).title)" --out "$@" + +../../01-L_Introduction_Overview/slides-sc23.pdf: BASEDECK=../../01-L_Introduction_Overview/slides.pdf +../../01b-H_Onboarding/slides-sc23.pdf: BASEDECK=../../01b-H_Onboarding/slides.pdf +../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf: BASEDECK=../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf +../../04-L_Performance_and_debugging_tools/slides-sc23.pdf: BASEDECK=../../04-L_Performance_and_debugging_tools/slides.pdf +../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf: BASEDECK=../../05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf +../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf: BASEDECK=../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf +../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf: BASEDECK=../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf +../../11-L_Summary_Advanced/slides-sc23.pdf: BASEDECK=../../11-L_Summary_Advanced/slides.pdf + +.SECONDEXPANSION: +%-sc23.pdf: %.pdf title-slide.$$(SESSIONKEY).tex $(BASEDECK) + latexmk -output-directory=$(MYTMPDIR) -jobname=${SESSIONKEY} -pdfxe title-slide.$(SESSIONKEY).tex + cp $(MYTMPDIR)/${SESSIONKEY}.pdf title-slide.$(SESSIONKEY).pdf + papersize=$(shell pdfinfo title-slide.$(SESSIONKEY).pdf | awk '/^Page size:/ {printf "{%fbp,%fbp}", $$3, $$5}') && \ + pdfjam --outfile tmp.pdf --papersize "$$papersize" $(BASEDECK) + mutool merge -o $@ title-slide.$(SESSIONKEY).pdf 0 tmp.pdf + rm tmp.pdf \ No newline at end of file diff --git a/.etc/sc23-titleslides/sc-background.png b/.etc/sc23-titleslides/sc-background.png new file mode 100644 index 0000000..d5c8d61 Binary files /dev/null and b/.etc/sc23-titleslides/sc-background.png differ diff --git a/.etc/sc22-titleslides/sessions.yml b/.etc/sc23-titleslides/sessions.yml similarity index 74% rename from .etc/sc22-titleslides/sessions.yml rename to .etc/sc23-titleslides/sessions.yml index d70d41d..e36192e 100644 --- a/.etc/sc22-titleslides/sessions.yml +++ b/.etc/sc23-titleslides/sessions.yml @@ -4,11 +4,17 @@ 01b: title: 'Onboarding JUWELS Booster' author: 'Andreas Herten, JSC' +02: + title: 'Introduction to MPI-Distributed Computing with GPUs' + author: 'Simon Garcia, SNL' 04: title: 'Peformance and Debugging Tools' author: 'Markus Hrywniak, NVIDIA' 05: title: 'Optimization Techniques for Multi-GPU Applications' + author: 'Simon Garcia, SNL' +07: + title: 'NCCL and Host-Initiated NVSHMEM' author: 'Jiri Kraus, NVIDIA' 09: title: 'CUDA Graphs and Device-initiated Communication with NVSHMEM' diff --git a/.etc/sc23-titleslides/title-slide.pdf b/.etc/sc23-titleslides/title-slide.pdf new file mode 100644 index 0000000..abc39f9 Binary files /dev/null and 
b/.etc/sc23-titleslides/title-slide.pdf differ diff --git a/.etc/sc22-titleslides/title-slide.tex b/.etc/sc23-titleslides/title-slide.tex similarity index 55% rename from .etc/sc22-titleslides/title-slide.tex rename to .etc/sc23-titleslides/title-slide.tex index 552391a..23bb847 100644 --- a/.etc/sc22-titleslides/title-slide.tex +++ b/.etc/sc23-titleslides/title-slide.tex @@ -8,14 +8,16 @@ \setbeamertemplate{navigation symbols}{} \setbeamertemplate{background}{\includegraphics[width=\paperwidth,height=\paperheight]{sc-background.png}} -\setbeamercolor{normal text}{fg=white} +\setbeamercolor{normal text}{fg=black} \begin{document} \newcommand{\mytitle}{INSERT TITLE HERE} \newcommand{\myauthor}{Insert Author Here} \begin{frame}[plain, t] -\vspace*{0.5\paperheight}\LARGE -\textbf{\mytitle}\\\large -\myauthor + \begin{tikzpicture}[overlay, remember picture, align=left, text width=0.85\paperwidth] + \coordinate (main anchor) at ([yshift=0.4\paperheight,xshift=0.06\paperwidth]current page.south west); + \node (title) at (main anchor) [anchor=south west, font=\LARGE\bfseries] {\mytitle}; + \node (subtitle) at ([yshift=-0.05\paperheight]main anchor) [anchor=north west, font=\large, text=white] {\myauthor}; + \end{tikzpicture} \end{frame} \end{document} diff --git a/.gitignore b/.gitignore index 3afa2cf..82bf06c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ -tut102-multi-gpu.tar.gz -*-sc22.pdftut* +tut140-multi-gpu.tar.gz +*-sc23.pdf +tut* diff --git a/01-L_Introduction_Overview/slides.pdf b/01-L_Introduction_Overview/slides.pdf index 653c54a..351f0f9 100644 Binary files a/01-L_Introduction_Overview/slides.pdf and b/01-L_Introduction_Overview/slides.pdf differ diff --git a/01b-H_Onboarding/onboarding.pdf b/01b-H_Onboarding/onboarding.pdf new file mode 100644 index 0000000..3fb3382 Binary files /dev/null and b/01b-H_Onboarding/onboarding.pdf differ diff --git a/01b-H_Onboarding/slides.pdf b/01b-H_Onboarding/slides.pdf index 7d7706e..bdb4d56 100644 Binary files a/01b-H_Onboarding/slides.pdf and b/01b-H_Onboarding/slides.pdf differ diff --git a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf index dec1995..e0914bc 100644 Binary files a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf and b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf differ diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb index 46b261f..29efb90 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", "### Task: Parallelize Jacobi Solver for 
Multiple GPUs using CUDA-aware MPI\n", "\n", @@ -71,7 +70,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "c9546c58-9fec-4bb4-9d7f-1eaa1c686c7b" + "id": "b99f8bbd-dc74-4837-bc8a-aa5ebd315fe7" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.md b/03-H_Multi_GPU_Parallelization/.master/Instructions.md index c8992d5..1095e4f 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI ### Task: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb index 46b261f..29efb90 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", "### Task: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI\n", "\n", @@ -71,7 +70,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "c9546c58-9fec-4bb4-9d7f-1eaa1c686c7b" + "id": "b99f8bbd-dc74-4837-bc8a-aa5ebd315fe7" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md index c8992d5..1095e4f 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI ### Task: Parallelize Jacobi Solver for Multiple GPUs using 
CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb index 46b261f..29efb90 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", "### Task: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI\n", "\n", @@ -71,7 +70,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "c9546c58-9fec-4bb4-9d7f-1eaa1c686c7b" + "id": "b99f8bbd-dc74-4837-bc8a-aa5ebd315fe7" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md index c8992d5..1095e4f 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI ### Task: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb index 46b261f..29efb90 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", "### Task: 
Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI\n", "\n", @@ -71,7 +70,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "c9546c58-9fec-4bb4-9d7f-1eaa1c686c7b" + "id": "b99f8bbd-dc74-4837-bc8a-aa5ebd315fe7" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md index c8992d5..1095e4f 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI ### Task: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI diff --git a/04-L_Performance_and_debugging_tools/slides.pdf b/04-L_Performance_and_debugging_tools/slides.pdf index a854d41..155d6c9 100644 Binary files a/04-L_Performance_and_debugging_tools/slides.pdf and b/04-L_Performance_and_debugging_tools/slides.pdf differ diff --git a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf index db220db..a235051 100644 Binary files a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf and b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf differ diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb index 5f851fd..f8ef908 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 6: Overlap Communication and Computation with MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", "You are now going to apply the concepts you learned in the lectures 4\n", "and 5: Using profiling tools, and applying them to implement overlapping\n", @@ -85,7 +84,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "634784f1-27fa-4390-9be3-6e9c1b18e102" + "id": "02d50eab-6f3d-4b5d-8743-4032717ba48f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md index 12083b0..42ee407 
100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 6: Overlap Communication and Computation with MPI You are now going to apply the concepts you learned in the lectures 4 and 5: Using profiling tools, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk b/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk index 8a96f59..b621217 100755 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk @@ -6,7 +6,7 @@ SOLUTIONDIR = ../solutions/ IYPNB_TEMPLATE = ../../.template.json PROCESSFILES = jacobi.cpp -COPYFILES = Makefile Instructions.ipynb jacobi_kernels.cu +COPYFILES = Makefile Instructions.ipynb Instructions.md jacobi_kernels.cu TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb index 5f851fd..f8ef908 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 6: Overlap Communication and Computation with MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", "You are now going to apply the concepts you learned in the lectures 4\n", "and 5: Using profiling tools, and applying them to implement overlapping\n", @@ -85,7 +84,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "634784f1-27fa-4390-9be3-6e9c1b18e102" + "id": "02d50eab-6f3d-4b5d-8743-4032717ba48f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md new file mode 100644 index 0000000..42ee407 --- /dev/null +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md @@ -0,0 +1,61 @@ +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA +- Program Link: + 
https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 +## Hands-On 6: Overlap Communication and Computation with MPI + +You are now going to apply the concepts you learned in the lectures 4 and 5: Using profiling tools, +and applying them to implement overlapping MPI with GPU kernels. + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. + +### Task 0: Profile the non-overlap MPI-CUDA version of the code + +Use the Nsight System profiler to profile the starting point version non-Overlap MPI jacobi solver. The objective is to become familiar in navigating the GUI identify possible areas to overlap computation and communication. + +1. Start by compiling and running the application with `make run` +1. Record an Nsight Systems profile, using the appropriate Makefile target (`make profile`) +1. Open the recorded profile in the GUI + - Either: Install Nsight Systems locally, and transfer the .qdrep/.nsys-rep file + - Or: By running Xpra in your browser: In Jupyter, select "File > New Launcher" and "Xpra Desktop", which will open in a new tab. Don't forget to source the environment in your `xterm`. +1. Familiarize yourself with the different rows and the traces they represent. + - See if you can correlate a CUDA API kernel launch call and the resulting kernel execution on the device +1. Follow the lecture steps and identify the relevant section with overlap potential in your code + - Hint: Try navigating with the NVTX ranges. + + +### Task 1: Implement Communication/Computation overlap + +Realize the optimization potential you discovered in the previous task and reduce the whitespace between kernel calls on the GPU profile by implementing communication/computation overlap. + +You will need to separately calculate the boundary, and you should use high-priority streams. A less efficient (problem size-dependent) alternative to high-priority streams would be to launch the boundary processing kernels before the bulk kernel. +regions for the halo exchange. + +The starting point of this task is the non-overlapping MPI variant of the Jacobi solver. 
+Follow the `TODO`s in `jacobi.cpp`: + +- Query the priority range to be used by the CUDA streams +- Create new top and bottom CUDA streams and corresponding CUDA events +- Initialize all streams using priorities +- Modify the original call to `launch_jacobi_kernel` to not compute the top and bottom regions +- Add additional calls to `launch_jacobi_kernel` for the top and bottom regions using the high-priority streams +- Wait on both top and bottom streams when calculating the norm +- Synchronize top and bottom streams before applying the periodic boundary conditions using MPI +- Destroy the additional cuda streams and events before ending the application + + diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb index 5f851fd..f8ef908 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 6: Overlap Communication and Computation with MPI\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", "You are now going to apply the concepts you learned in the lectures 4\n", "and 5: Using profiling tools, and applying them to implement overlapping\n", @@ -85,7 +84,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "634784f1-27fa-4390-9be3-6e9c1b18e102" + "id": "02d50eab-6f3d-4b5d-8743-4032717ba48f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md new file mode 100644 index 0000000..42ee407 --- /dev/null +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md @@ -0,0 +1,61 @@ +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA +- Program Link: + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 +## Hands-On 6: Overlap Communication and Computation with MPI + +You are now going to apply the concepts you learned in the lectures 4 and 5: Using profiling tools, +and applying them to implement overlapping MPI with GPU kernels. + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. + +### Task 0: Profile the non-overlap MPI-CUDA version of the code + +Use the Nsight System profiler to profile the starting point version non-Overlap MPI jacobi solver. 
The objective is to become familiar in navigating the GUI identify possible areas to overlap computation and communication. + +1. Start by compiling and running the application with `make run` +1. Record an Nsight Systems profile, using the appropriate Makefile target (`make profile`) +1. Open the recorded profile in the GUI + - Either: Install Nsight Systems locally, and transfer the .qdrep/.nsys-rep file + - Or: By running Xpra in your browser: In Jupyter, select "File > New Launcher" and "Xpra Desktop", which will open in a new tab. Don't forget to source the environment in your `xterm`. +1. Familiarize yourself with the different rows and the traces they represent. + - See if you can correlate a CUDA API kernel launch call and the resulting kernel execution on the device +1. Follow the lecture steps and identify the relevant section with overlap potential in your code + - Hint: Try navigating with the NVTX ranges. + + +### Task 1: Implement Communication/Computation overlap + +Realize the optimization potential you discovered in the previous task and reduce the whitespace between kernel calls on the GPU profile by implementing communication/computation overlap. + +You will need to separately calculate the boundary, and you should use high-priority streams. A less efficient (problem size-dependent) alternative to high-priority streams would be to launch the boundary processing kernels before the bulk kernel. +regions for the halo exchange. + +The starting point of this task is the non-overlapping MPI variant of the Jacobi solver. +Follow the `TODO`s in `jacobi.cpp`: + +- Query the priority range to be used by the CUDA streams +- Create new top and bottom CUDA streams and corresponding CUDA events +- Initialize all streams using priorities +- Modify the original call to `launch_jacobi_kernel` to not compute the top and bottom regions +- Add additional calls to `launch_jacobi_kernel` for the top and bottom regions using the high-priority streams +- Wait on both top and bottom streams when calculating the norm +- Synchronize top and bottom streams before applying the periodic boundary conditions using MPI +- Destroy the additional cuda streams and events before ending the application + + diff --git a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf index 9b3a058..4a5ef6e 100644 Binary files a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf and b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf differ diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb index f1f674b..2bd9d62 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 
8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", "### Task: Using NCCL\n", "\n", @@ -22,15 +21,14 @@ "variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", "\n", "Compile with\n", "\n", @@ -48,7 +46,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." ], - "id": "fcc1dec5-cae8-41fb-a41d-2f5f1e5c5509" + "id": "0179bd70-4d9b-4560-859f-31ecf7708e9e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md index e44ea2a..7368f28 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication ### Task: Using NCCL @@ -13,14 +12,13 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NVSHMEM: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. 
+- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator Compile with diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp index 33760e2..1995287 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp @@ -271,7 +271,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range #ifdef SOLUTION PUSH_RANGE("NCCL_Warmup", 5) #else @@ -326,7 +326,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -346,7 +346,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() #ifdef SOLUTION @@ -358,14 +358,14 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); #else - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -410,13 +410,13 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap #ifdef SOLUTION printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb index b7d479f..c8d814e 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: 
Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -22,18 +21,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -64,7 +62,7 @@ "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." ], - "id": "c38ceeb2-bbb5-416b-a9ad-dd1dad8ac0e1" + "id": "688b0b98-419c-4dd3-a60f-f1f1d3018e63" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md index 38a3b32..13a5729 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -13,15 +12,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. 
- - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! +- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. +- Finalize NVSHMEM before existing the application Compile with diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu index e755c7c..e4f6bcd 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu @@ -355,11 +355,11 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions #ifdef SOLUTION - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); #else PUSH_RANGE("MPI", 5) @@ -369,12 +369,12 @@ int main(int argc, char* argv[]) { MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) 
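        // Editorial note, not part of the patch: nvshmemx_float_put_on_stream above writes halo
        // rows directly into the neighbouring PE's symmetric memory. The stream-ordered
        // nvshmemx_barrier_all_on_stream ensures all outstanding puts have been delivered and all
        // PEs have reached this point, so no PE starts the next Jacobi iteration reading halo
        // values that have not arrived yet.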
#ifdef SOLUTION nvshmemx_barrier_all_on_stream(compute_stream); #endif @@ -419,9 +419,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output + //TODO: Replace MPI with NVSHMEM for your output #ifdef SOLUTION - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb index f1f674b..2bd9d62 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", "### Task: Using NCCL\n", "\n", @@ -22,15 +21,14 @@ "variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", "\n", "Compile with\n", "\n", @@ -48,7 +46,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." 
], - "id": "fcc1dec5-cae8-41fb-a41d-2f5f1e5c5509" + "id": "0179bd70-4d9b-4560-859f-31ecf7708e9e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md index e44ea2a..7368f28 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication ### Task: Using NCCL @@ -13,14 +12,13 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NVSHMEM: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. +- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator Compile with diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp index 0b0b018..7b4e5fd 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range PUSH_RANGE("NCCL_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? rank - 1 : (size - 1); @@ -308,7 +308,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -328,7 +328,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. 
// Remember to use ncclGroupStart() and ncclGroupEnd() PUSH_RANGE("NCCL_LAUNCH", 5) @@ -338,7 +338,7 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -383,9 +383,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb index b7d479f..c8d814e 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -22,18 +21,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. 
Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -64,7 +62,7 @@ "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." ], - "id": "c38ceeb2-bbb5-416b-a9ad-dd1dad8ac0e1" + "id": "688b0b98-419c-4dd3-a60f-f1f1d3018e63" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md index 38a3b32..13a5729 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -13,15 +12,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. - - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! +- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. 
+- Finalize NVSHMEM before existing the application Compile with diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu index d293c62..dd55b30 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu @@ -342,17 +342,17 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) nvshmemx_barrier_all_on_stream(compute_stream); if (calculate_norm) { @@ -395,8 +395,8 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + //TODO: Replace MPI with NVSHMEM for your output + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb index d147cf5..2bd9d62 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", "### Task: Using NCCL\n", "\n", @@ -22,15 +21,14 @@ "variant of the jacobi solver. 
You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NCCL:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", "\n", "Compile with\n", "\n", @@ -48,7 +46,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." ], - "id": "fcc1dec5-cae8-41fb-a41d-2f5f1e5c5509" + "id": "0179bd70-4d9b-4560-859f-31ecf7708e9e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md index fe70698..7368f28 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication ### Task: Using NCCL @@ -13,14 +12,13 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NCCL: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. +- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator Compile with diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp index 3588d21..f99c86a 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp @@ -251,7 +251,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range PUSH_RANGE("MPI_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? 
rank - 1 : (size - 1); @@ -292,7 +292,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -312,16 +312,16 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -366,9 +366,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb index b7d479f..c8d814e 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -22,18 +21,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. 
Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -64,7 +62,7 @@ "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." ], - "id": "c38ceeb2-bbb5-416b-a9ad-dd1dad8ac0e1" + "id": "688b0b98-419c-4dd3-a60f-f1f1d3018e63" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md index 38a3b32..13a5729 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -13,15 +12,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. - - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! +- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. 
+- Finalize NVSHMEM before existing the application Compile with diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu index 0359d7e..b754207 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu @@ -333,7 +333,7 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, @@ -341,12 +341,12 @@ int main(int argc, char* argv[]) { MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) if (calculate_norm) { CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); @@ -388,7 +388,7 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output + //TODO: Replace MPI with NVSHMEM for your output printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, (stop - start), runtime_serial); } else { diff --git a/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf b/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf index 60b761b..8095d4a 100644 Binary files a/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf and b/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf differ diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index ebc5ea5..d0c4c4a 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Monday, 14 November 2022 8:30AM - 5PM CST\n", - "- Location: D163, Kay Bailey Hutchison Convention Center Dallas\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://sc22.supercomputing.org/presentation/?id=tut102&sess=sess196\n", - "\n", - "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -86,7 +85,7 @@ " 
repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." ], - "id": "57748efe-edc3-44ae-9321-54c897d82a47" + "id": "3d1de1bf-0ebb-4709-bddc-9e34484b03bb" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md index 7b31752..2217166 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk index df1bb37..fdc71c0 100755 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk @@ -1,7 +1,6 @@ #!/usr/bin/make -f # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-TASKDIR = ../../tasks/Device-initiated_Communication_with_NVSHM -EM +TASKDIR = ../../tasks/Device-initiated_Communication_with_NVSHMEM SOLUTIONDIR = ../../solutions/Device-initiated_Communication_with_NVSHMEM IYPNB_TEMPLATE = ../../../.template.json diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb index cd06caa..8c9afe1 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 10B: Using CUDA Graphs\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10B: Using CUDA Graphs\n", "\n", "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", "\n", @@ -65,7 +64,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." ], - "id": "4865b2a9-edfd-40e8-879d-f503b5b30fd5" + "id": "bc28bb54-3dda-4b0a-9465-f6609051d0cc" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md index 2e3645a..b6e9534 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10B: Using CUDA Graphs ### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index ebc5ea5..d0c4c4a 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb +++ 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Monday, 14 November 2022 8:30AM - 5PM CST\n", - "- Location: D163, Kay Bailey Hutchison Convention Center Dallas\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://sc22.supercomputing.org/presentation/?id=tut102&sess=sess196\n", - "\n", - "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -86,7 +85,7 @@ " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." ], - "id": "57748efe-edc3-44ae-9321-54c897d82a47" + "id": "3d1de1bf-0ebb-4709-bddc-9e34484b03bb" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md index 7b31752..2217166 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb index cd06caa..8c9afe1 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", 
- "\n", - "## Hands-On 10B: Using CUDA Graphs\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10B: Using CUDA Graphs\n", "\n", "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", "\n", @@ -65,7 +64,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." ], - "id": "4865b2a9-edfd-40e8-879d-f503b5b30fd5" + "id": "bc28bb54-3dda-4b0a-9465-f6609051d0cc" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md index 2e3645a..b6e9534 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10B: Using CUDA Graphs ### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index ebc5ea5..d0c4c4a 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Monday, 14 November 2022 8:30AM - 5PM CST\n", - "- Location: D163, Kay Bailey Hutchison Convention Center Dallas\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://sc22.supercomputing.org/presentation/?id=tut102&sess=sess196\n", - "\n", - "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -86,7 +85,7 @@ " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." 
], - "id": "57748efe-edc3-44ae-9321-54c897d82a47" + "id": "3d1de1bf-0ebb-4709-bddc-9e34484b03bb" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md index 7b31752..2217166 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb index cd06caa..8c9afe1 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST\n", - "- Location: Hall Y8, Congress Center Hamburg, Germany\n", + "- Time: Monday, 13 November 2023 8:30 - 17:00 MST\n", + "- Location: 405, Denver Congress Centre, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==\n", - "\n", - "## Hands-On 10B: Using CUDA Graphs\n", + " https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242\n", + " \\## Hands-On 10B: Using CUDA Graphs\n", "\n", "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", "\n", @@ -65,7 +64,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." 
], - "id": "4865b2a9-edfd-40e8-879d-f503b5b30fd5" + "id": "bc28bb54-3dda-4b0a-9465-f6609051d0cc" } ], "nbformat": 4, diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md index 2e3645a..b6e9534 100644 --- a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md @@ -1,10 +1,9 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 21 May 2023 9:00 - 18:00 CEST -- Location: Hall Y8, Congress Center Hamburg, Germany +- Time: Monday, 13 November 2023 8:30 - 17:00 MST +- Location: 405, Denver Congress Centre, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA== - + https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242 ## Hands-On 10B: Using CUDA Graphs ### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication diff --git a/11-L_Summary_Advanced/slides.pdf b/11-L_Summary_Advanced/slides.pdf index 1914e74..20ec4d8 100644 Binary files a/11-L_Summary_Advanced/slides.pdf and b/11-L_Summary_Advanced/slides.pdf differ diff --git a/README.md b/README.md index 2deed2f..4bbc5c9 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,40 @@ -# ISC23 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC23 Tutorial: Efficient Distributed GPU Programming for Exascale [![DOI](https://zenodo.org/badge/409504932.svg)](https://zenodo.org/badge/latestdoi/409504932) -Repository with talks and exercises of our Efficient GPU Programming for Exascale tutorial, to be held at [ISC23](https://app.swapcard.com/widget/event/isc-high-performance-2023/planning/UGxhbm5pbmdfMTIyMDc5OA==). +Repository with talks and exercises of our Efficient GPU Programming for Exascale tutorial, to be held at [SC23](https://sc23.supercomputing.org/presentation/?id=tut140&sess=sess242). ## Coordinates -* Date: 21 May 2023 -* Occasion: ISC23 Tutorial +* Date: 13 November 2023 +* Occasion: SC23 Tutorial * Tutors: Simon Garcia (SNL), Andreas Herten (JSC), Markus Hrywniak (NVIDIA), Jiri Kraus (NVIDIA), Lena Oden (Uni Hagen) + ## Setup The tutorial is an interactive tutorial with introducing lectures and practical exercises to apply knowledge. The exercises have been derived from the Jacobi solver implementations available in [NVIDIA/multi-gpu-programming-models](https://github.com/NVIDIA/multi-gpu-programming-models). +Walk-through: + +* Sign up at JuDoor: https://go.fzj.de/mg-jd +* Open Jupyter JSC: https://jupyter-jsc.fz-juelich.de +* Create new Jupyter instance on JUWELS, using training2332 account, on **LoginNodeBooster** +* Source course environment: `source $PROJECT_training2332/env.sh` +* Sync material: `jsc-material-sync` +* Locally install NVIDIA Nsight Systems: https://developer.nvidia.com/nsight-systems + Curriculum: 1. Lecture: Tutorial Overview, Introduction to System + Onboarding *Andreas* 2. Lecture: MPI-Distributed Computing with GPUs *Simon* 3. Hands-on: Multi-GPU Parallelization 4. Lecture: Performance / Debugging Tools *Markus* -5. Lecture: Optimization Techniques for Multi-GPU Applications *Markus* +5. Lecture: Optimization Techniques for Multi-GPU Applications *Simon* 6. 
Hands-on: Overlap Communication and Computation with MPI -7. Lecture: Overview of NCCL and NVSHMEN in MPI *Lena* +7. Lecture: Overview of NCCL and NVSHMEM in MPI *Jiri* 8. Hands-on: Using NCCL and NVSHMEM -9. Lecture: Device-initiated Communication with NVSHMEM *Lena* +9. Lecture: Device-initiated Communication with NVSHMEM *Jiri* 10. Hands-on: Using Device-Initiated Communication with NVSHMEM 11. Lecture: Conclusion and Outline of Advanced Topics *Andreas*
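For orientation, curriculum item 6 ("Overlap Communication and Computation with MPI") boils down to the pattern below. This is an editor-added sketch rather than material from the hands-ons: it assumes CUDA-aware MPI and the row layout of the Jacobi exercises, and `launch_jacobi_kernel` appears only in comments because its signature lives in the exercise code.

```cuda
#include <mpi.h>
#include <cuda_runtime.h>

void jacobi_step_overlapped(float* a, float* a_new, int nx, int iy_start, int iy_end,
                            int top, int bottom, cudaStream_t compute_stream,
                            cudaStream_t push_stream) {
    // 1) Boundary rows on push_stream, interior on compute_stream (runs concurrently).
    // launch_jacobi_kernel(..., iy_start, iy_start + 1, nx, ..., push_stream);
    // launch_jacobi_kernel(..., iy_end - 1, iy_end, nx, ..., push_stream);
    // launch_jacobi_kernel(..., iy_start + 1, iy_end - 1, nx, ..., compute_stream);

    // 2) Wait only for the boundary rows, then exchange halos with (CUDA-aware)
    //    MPI while the interior kernel is still running.
    cudaStreamSynchronize(push_stream);
    MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
                 a_new + iy_end * nx, nx, MPI_FLOAT, bottom, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0,
                 a_new, nx, MPI_FLOAT, top, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    // 3) The interior finishes in the background; synchronize before the next
    //    iteration swaps a and a_new.
    cudaStreamSynchronize(compute_stream);
}
```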