diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile new file mode 100644 index 00000000000..8aefbfe8f47 --- /dev/null +++ b/.ci/docker/Dockerfile @@ -0,0 +1,25 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND noninteractive + +# Install common dependencies (so that this step can be cached separately) +COPY ./common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +# Setup user +# TODO: figure out how to remove this part +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +COPY ./common/install_docs_reqs.sh install_docs_reqs.sh +RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh + +COPY ./common/install_pip_requirements.sh install_pip_requirements.sh +COPY ./requirements.txt requirements.txt +RUN bash ./install_pip_requirements.sh && rm install_pip_requirements.sh + +RUN ln -s /usr/bin/python3 /usr/bin/python + +USER ci-user +CMD ["bash"] diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh new file mode 100755 index 00000000000..f40c45fea3d --- /dev/null +++ b/.ci/docker/build.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +IMAGE_NAME="$1" +shift + +export UBUNTU_VERSION="22.04" +export CUDA_VERSION="12.6.3" + +export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" +echo "Building ${IMAGE_NAME} Docker image" + +docker build \ + --no-cache \ + --progress=plain \ + -f Dockerfile \ + --build-arg BASE_IMAGE="${BASE_IMAGE}" \ + "$@" \ + . diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh new file mode 100644 index 00000000000..3100b550a89 --- /dev/null +++ b/.ci/docker/common/install_base.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker + +set -ex + +install_ubuntu() { + # Install common dependencies + apt-get update + # TODO: Some of these may not be necessary + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake=3.22* \ + curl \ + git \ + wget \ + sudo \ + vim \ + jq \ + vim \ + unzip \ + gdb \ + rsync \ + libssl-dev \ + p7zip-full \ + libglfw3 \ + libglfw3-dev \ + sox \ + libsox-dev \ + libsox-fmt-all \ + python3-pip \ + python3-dev + + # Cleanup package manager + apt-get autoclean && apt-get clean + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +} + +# Install base packages depending on the base OS +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +case "$ID" in + ubuntu) + install_ubuntu + ;; + *) + echo "Unable to determine OS..." 
+ exit 1 + ;; +esac diff --git a/.ci/docker/common/install_docs_reqs.sh b/.ci/docker/common/install_docs_reqs.sh new file mode 100644 index 00000000000..541c9976ad1 --- /dev/null +++ b/.ci/docker/common/install_docs_reqs.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker +set -ex + +apt-get update +apt-get install -y gpg-agent + +curl --retry 3 -sL https://deb.nodesource.com/setup_20.x | sudo -E bash - +sudo apt-get install -y nodejs + +curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add - +echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list + +apt-get update +apt-get install -y --no-install-recommends yarn +yarn global add katex --prefix /usr/local + +sudo apt-get -y install doxygen + +apt-get autoclean && apt-get clean +rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/.ci/docker/common/install_pip_requirements.sh b/.ci/docker/common/install_pip_requirements.sh new file mode 100644 index 00000000000..a548d200462 --- /dev/null +++ b/.ci/docker/common/install_pip_requirements.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -ex + +# Install pip packages +pip install --upgrade pip +pip install -r ./requirements.txt diff --git a/.ci/docker/common/install_user.sh b/.ci/docker/common/install_user.sh new file mode 100644 index 00000000000..6deb62086bc --- /dev/null +++ b/.ci/docker/common/install_user.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Copied from https://github.com/pytorch/executorch/blob/6e431355a554e5f84c3a05dfa2b981ead90c2b48/.ci/docker/common/install_user.sh#L1 + +set -ex + +# Same as ec2-user +echo "ci-user:x:1000:1000::/var/lib/ci-user:" >> /etc/passwd +echo "ci-user:x:1000:" >> /etc/group +# Needed on Focal or newer +echo "ci-user:*:19110:0:99999:7:::" >> /etc/shadow + +# Create $HOME +mkdir -p /var/lib/ci-user +chown ci-user:ci-user /var/lib/ci-user + +# Allow sudo +echo 'ci-user ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/ci-user + +# Test that sudo works +sudo -u ci-user sudo -v diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt new file mode 100644 index 00000000000..f2dbaf2e28a --- /dev/null +++ b/.ci/docker/requirements.txt @@ -0,0 +1,78 @@ +# --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version. +# Refer to ./jenkins/build.sh for tutorial build instructions. 
+ +# Sphinx dependencies +sphinx==7.2.6 +sphinx-gallery==0.19.0 +sphinx-reredirects==0.1.4 +sphinx_design==0.6.1 +docutils>=0.18.1,<0.21 +sphinx-copybutton==0.5.2 +sphinx_sitemap==2.7.1 +sphinxcontrib-mermaid==1.0.0 +sphinxcontrib.katex==0.9.10 +pypandoc==1.15 +pandocfilters==1.5.1 +markdown==3.8.2 + +# PyTorch Theme +git+https://github.com/pytorch/pytorch_sphinx_theme.git@bbe196cbc5037d69d89ad1bf079e2afa3b3e9611#egg=pytorch_sphinx_theme2 + +# Tutorial dependencies +tqdm==4.66.1 +numpy==1.24.4 +matplotlib +librosa +torch==2.8 +torchvision +torchdata +networkx +PyHamcrest +bs4 +awscliv2==2.1.1 +flask +spacy==3.4.1 +ray[tune]==2.7.2 +tensorboard +jinja2==3.1.3 +pytorch-lightning +torchx +torchrl==0.9.2 +tensordict==0.9.1 +# For ax_multiobjective_nas_tutorial.py +ax-platform>=0.4.0,<0.5.0 +nbformat>=5.9.2 +datasets +transformers +onnx +onnxscript>=0.2.2 +onnxruntime +evaluate +accelerate>=0.20.1 + +importlib-metadata==6.8.0 + +ipython + +sphinxcontrib.katex +# to run examples +boto3 +pandas +requests +scikit-image +scipy==1.11.1 +numba==0.57.1 +pillow==10.2.0 +wget +gym==0.26.2 +gym-super-mario-bros==7.4.0 +pyopengl +gymnasium[mujoco]==0.27.0 +timm +pygame==2.6.0 +pycocotools +semilearn==0.3.2 +torchao==0.10.0 +segment_anything==1.0 +torchrec==1.2.0; platform_system == "Linux" +fbgemm-gpu==1.2.0; platform_system == "Linux" diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000000..4928e536acf --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,8 @@ +FROM mcr.microsoft.com/vscode/devcontainers/python:3.8 + +COPY requirements.txt /tmp/pip-tmp/ + +RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ + && apt-get install git gcc unzip make -y \ + && pip3 install --disable-pip-version-check --no-cache-dir -r /tmp/pip-tmp/requirements.txt \ + && rm -rf /tmp/pip-tmp diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000000..86fe20483c5 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,18 @@ +{ + "name": "PyTorch Tutorials", + "build": { + "context": "..", + "dockerfile": "Dockerfile", + "args": {} + }, + "settings": { + "terminal.integrated.shell.linux": "/bin/bash", + "workbench.startupEditor": "none", + "files.autoSave": "afterDelay", + "python.dataScience.enabled": true, + "python.dataScience.alwaysTrustNotebooks": true, + "python.insidersChannel": "weekly", + "python.showStartPage": false + }, + "extensions": ["ms-python.python", "lextudio.restructuredtext"] +} diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt new file mode 100644 index 00000000000..2be1df895be --- /dev/null +++ b/.devcontainer/requirements.txt @@ -0,0 +1,31 @@ +# Refer to ./jenkins/build.sh for tutorial build instructions + +sphinx==1.8.2 +sphinx-gallery==0.3.1 +tqdm +numpy +matplotlib +torch +torchvision +torchtext +torchaudio +PyHamcrest +bs4 +awscli==1.16.35 +flask +spacy +ray[tune] + +# PyTorch Theme +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme + +ipython + +# to run examples +pandas +scikit-image +pillow==10.3.0 +wget + +# for codespaces env +pylint diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 00000000000..937417f4999 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,60 @@ +name: 🐛 Bug Report +description: Create a tutorial bug report +title: "[BUG] -
+ # Escape the code and wrap it in <code> tags
+ return f'<code>{html.escape(item["c"][1])}</code>'
+ elif item['t'] == 'CodeBlock':
+ # Escape the code block and wrap it in <pre><code> tags
+ return f'<pre><code>{html.escape(item["c"][1])}</code></pre>'
+ else:
+ return ''
+
+
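For reference, a sketch of the pandoc JSON nodes this to_markdown handler receives (node layout follows pandoc's AST; the contents are illustrative):

    # An inline Code node and a CodeBlock node, as pandoc emits them:
    code_item = {'t': 'Code', 'c': [['', [], []], 'x = 1']}
    # to_markdown(code_item) -> '<code>x = 1</code>'
    block_item = {'t': 'CodeBlock', 'c': [['', [], []], 'print("hello")']}
    # to_markdown(block_item) -> '<pre><code>print(&quot;hello&quot;)</code></pre>'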
+def process_admonitions(key, value, format, meta):
+ # Replace admonitions with proper HTML.
+ if key == 'Div':
+ [[ident, classes, keyvals], contents] = value
+ if 'note' in classes:
+ color = '#54c7ec'
+ label = 'NOTE:'
+ elif 'tip' in classes:
+ color = '#6bcebb'
+ label = 'TIP:'
+ elif 'warning' in classes:
+ color = '#e94f3b'
+ label = 'WARNING:'
+ else:
+ return
+
+ note_content = []
+ for block in contents:
+ if block.get('t') == 'Para':
+ for item in block['c']:
+ if item['t'] == 'Str':
+ note_content.append(Str(item['c']))
+ elif item['t'] == 'Space':
+ note_content.append(Space())
+ elif item['t'] == 'Link':
+ note_content.append(Link(*item['c']))
+ elif item['t'] == 'Code':
+ note_content.append(Code(*item['c']))
+ elif block.get('t') == 'CodeBlock':
+ note_content.append(CodeBlock(*block['c']))
+
+ note_content_md = ''.join(to_markdown(item) for item in note_content)
+ html_content = markdown.markdown(note_content_md)
+
+ return [{'t': 'RawBlock', 'c': ['html', f'<div style="background-color: {color}"><strong>{label}</strong></div>']}, {'t': 'RawBlock', 'c': ['html', '<div>']}, {'t': 'RawBlock', 'c': ['html', html_content]}, {'t': 'RawBlock', 'c': ['html', '</div>']}]
+ elif key == 'RawBlock':
+ # This is needed for the cells that have embedded video.
+ # We add a special tag to those: ``` {.python .jupyter-code-cell}
+ # The post-processing script then finds those and generates separate
+ # code cells that can load the video.
+ [format, content] = value
+ if format == 'html' and 'iframe' in content:
+ # Create the Python code to display the video
+ python_code = f"""
+from IPython.display import display, HTML
+html_code = \"""
+{content}
+\"""
+display(HTML(html_code))
+"""
+
+ return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}
+
+
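As a rough illustration, the Div node that the admonition branch above rewrites has this shape (per pandoc's JSON AST; the text is made up):

    value = [
        ['', ['note'], []],  # [ident, classes, keyvals]
        [{'t': 'Para', 'c': [{'t': 'Str', 'c': 'Remember'},
                             {'t': 'Space'},
                             {'t': 'Str', 'c': 'this.'}]}],
    ]
    # process_admonitions('Div', value, 'markdown', {}) returns RawBlock HTML:
    # a colored "NOTE:" banner followed by the markdown-rendered body.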
+def process_images(key, value, format, meta):
+ # Add https://pytorch.org/tutorials/ to images so that they
+ # load correctly in the notebook.
+ if key != 'Image':
+ return None
+ [ident, classes, keyvals], caption, [src, title] = value
+ if not src.startswith('http'):
+ while src.startswith('../'):
+ src = src[3:]
+ if src.startswith('/_static'):
+ src = src[1:]
+ src = 'https://pytorch.org/tutorials/' + src
+
+ return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]}
+
+
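A quick trace of the rewriting above, with an illustrative relative path:

    value = [['', [], []], [], ['../../_static/img/thumb.png', '']]
    # '../../_static/img/thumb.png' -> '_static/img/thumb.png'
    #   -> 'https://pytorch.org/tutorials/_static/img/thumb.png'
    # process_images('Image', value, 'markdown', {}) returns the Image node
    # with the absolute URL substituted for src.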
+def process_grids(key, value, format, meta):
+ # Generate side by side grid cards. Only for the two-cards layout
+ # that we use in the tutorial template.
+ if key == 'Div':
+ [[ident, classes, keyvals], contents] = value
+ if 'grid' in classes:
+ columns = ['<div style="width: 50%; float: left; padding: 10px;">',
+ '<div style="width: 50%; float: left; padding: 10px;">']
+ column_num = 0
+ for block in contents:
+ if 't' in block and block['t'] == 'Div' and 'grid-item-card' in block['c'][0][1]:
+ item_html = ''
+ for item in block['c'][1]:
+ if item['t'] == 'Para':
+ item_html += '<p>' + ''.join(to_markdown(i) for i in item['c']) + '</p>'
+ elif item['t'] == 'BulletList':
+ item_html += '<ul>'
+ for list_item in item['c']:
+ item_html += '<li>' + ''.join(to_markdown(i) for i in list_item[0]['c']) + '</li>'
+ item_html += '</ul>'
+ columns[column_num] += item_html
+ column_num = (column_num + 1) % 2
+ columns = [column + '</div>' for column in columns]
+ return {'t': 'RawBlock', 'c': ['html', ''.join(columns)]}
+
+def is_code_block(item):
+ return item['t'] == 'Code' and 'octicon' in item['c'][1]
+
+
+def process_all(key, value, format, meta):
+ for transform in [process_admonitions, process_images, process_grids]:
+ new_value = transform(key, value, format, meta)
+ if new_value is not None:
+ break
+ return new_value
+
+
+if __name__ == "__main__":
+ toJSONFilter(process_all)
diff --git a/.jenkins/delete_html_file_with_runnable_code_removed.py b/.jenkins/delete_html_file_with_runnable_code_removed.py
new file mode 100644
index 00000000000..b84a0ecd92e
--- /dev/null
+++ b/.jenkins/delete_html_file_with_runnable_code_removed.py
@@ -0,0 +1,11 @@
+import sys
+import os
+
+html_file_path = sys.argv[1]
+
+with open(html_file_path, 'r', encoding='utf-8') as html_file:
+ html = html_file.read()
+
+if "%%%%%%RUNNABLE_CODE_REMOVED%%%%%%" in html:
+ print("Removing " + html_file_path)
+ os.remove(html_file_path)
diff --git a/.jenkins/download_data.py b/.jenkins/download_data.py
new file mode 100644
index 00000000000..939e63fc7a8
--- /dev/null
+++ b/.jenkins/download_data.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+import hashlib
+import os
+
+from typing import Optional
+from urllib.request import urlopen, Request
+from pathlib import Path
+from zipfile import ZipFile
+
+REPO_BASE_DIR = Path(__file__).absolute().parent.parent
+DATA_DIR = REPO_BASE_DIR / "_data"
+BEGINNER_DATA_DIR = REPO_BASE_DIR / "beginner_source" / "data"
+INTERMEDIATE_DATA_DIR = REPO_BASE_DIR / "intermediate_source" / "data"
+ADVANCED_DATA_DIR = REPO_BASE_DIR / "advanced_source" / "data"
+PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "unstable_source" / "data"
+FILES_TO_RUN = os.getenv("FILES_TO_RUN")
+
+
+def size_fmt(nbytes: int) -> str:
+ """Returns a formatted file size string"""
+ KB = 1024
+ MB = 1024 * KB
+ GB = 1024 * MB
+ if abs(nbytes) >= GB:
+ return f"{nbytes * 1.0 / GB:.2f} Gb"
+ elif abs(nbytes) >= MB:
+ return f"{nbytes * 1.0 / MB:.2f} Mb"
+ elif abs(nbytes) >= KB:
+ return f"{nbytes * 1.0 / KB:.2f} Kb"
+ return str(nbytes) + " bytes"
+
+
+def download_url_to_file(url: str,
+ dst: Optional[str] = None,
+ prefix: Optional[Path] = None,
+ sha256: Optional[str] = None) -> Path:
+ dst = dst if dst is not None else Path(url).name
+ dst = dst if prefix is None else str(prefix / dst)
+ if Path(dst).exists():
+ print(f"Skip downloading {url} as {dst} already exists")
+ return Path(dst)
+ file_size = None
+ u = urlopen(Request(url, headers={"User-Agent": "tutorials.downloader"}))
+ meta = u.info()
+ if hasattr(meta, 'getheaders'):
+ content_length = meta.getheaders("Content-Length")
+ else:
+ content_length = meta.get_all("Content-Length")
+ if content_length is not None and len(content_length) > 0:
+ file_size = int(content_length[0])
+ sha256_sum = hashlib.sha256()
+ with open(dst, "wb") as f:
+ while True:
+ buffer = u.read(32768)
+ if len(buffer) == 0:
+ break
+ sha256_sum.update(buffer)
+ f.write(buffer)
+ digest = sha256_sum.hexdigest()
+ if sha256 is not None and sha256 != digest:
+ Path(dst).unlink()
+ raise RuntimeError(f"Downloaded {url} has unexpected sha256sum {digest} should be {sha256}")
+ print(f"Downloaded {url} sha256sum={digest} size={size_fmt(file_size)}")
+ return Path(dst)
+
+
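To spot-check an already-downloaded artifact by hand, a standalone helper along the same lines (a sketch; mirrors the 32 KB chunked read above):

    import hashlib
    from pathlib import Path

    def sha256_of(path: Path) -> str:
        # Hash the file in 32 KB chunks, like download_url_to_file does.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(32768), b""):
                h.update(chunk)
        return h.hexdigest()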
+def unzip(archive: Path, tgt_dir: Path) -> None:
+ with ZipFile(str(archive), "r") as zip_ref:
+ zip_ref.extractall(str(tgt_dir))
+
+
+def download_hymenoptera_data():
+ # transfer learning tutorial data
+ z = download_url_to_file("https://download.pytorch.org/tutorial/hymenoptera_data.zip",
+ prefix=DATA_DIR,
+ sha256="fbc41b31d544714d18dd1230b1e2b455e1557766e13e67f9f5a7a23af7c02209",
+ )
+ unzip(z, BEGINNER_DATA_DIR)
+
+
+def download_nlp_data() -> None:
+ # nlp tutorial data
+ z = download_url_to_file("https://download.pytorch.org/tutorial/data.zip",
+ prefix=DATA_DIR,
+ sha256="fb317e80248faeb62dc25ef3390ae24ca34b94e276bbc5141fd8862c2200bff5",
+ )
+ # This will unzip all files in data.zip to intermediate_source/data/ folder
+ unzip(z, INTERMEDIATE_DATA_DIR.parent)
+
+
+def download_dcgan_data() -> None:
+ # Download dataset for beginner_source/dcgan_faces_tutorial.py
+ z = download_url_to_file("https://s3.amazonaws.com/pytorch-tutorial-assets/img_align_celeba.zip",
+ prefix=DATA_DIR,
+ sha256="46fb89443c578308acf364d7d379fe1b9efb793042c0af734b6112e4fd3a8c74",
+ )
+ unzip(z, BEGINNER_DATA_DIR / "celeba")
+
+
+def download_lenet_mnist() -> None:
+ # Download model for beginner_source/fgsm_tutorial.py
+ download_url_to_file("https://docs.google.com/uc?export=download&id=1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl",
+ prefix=BEGINNER_DATA_DIR,
+ dst="lenet_mnist_model.pth",
+ sha256="cb5f8e578aef96d5c1a2cc5695e1aa9bbf4d0fe00d25760eeebaaac6ebc2edcb",
+ )
+
+def download_gpu_quantization_torchao() -> None:
+ # Download SAM model checkpoint unstable_source/gpu_quantization_torchao_tutorial.py
+ download_url_to_file("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+ prefix=PROTOTYPE_DATA_DIR,
+ dst="sam_vit_h_4b8939.pth",
+ sha256="a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e",
+ )
+
+def main() -> None:
+ DATA_DIR.mkdir(exist_ok=True)
+ BEGINNER_DATA_DIR.mkdir(exist_ok=True)
+ ADVANCED_DATA_DIR.mkdir(exist_ok=True)
+ INTERMEDIATE_DATA_DIR.mkdir(exist_ok=True)
+ PROTOTYPE_DATA_DIR.mkdir(exist_ok=True)
+
+ if FILES_TO_RUN is None or "transfer_learning_tutorial" in FILES_TO_RUN:
+ download_hymenoptera_data()
+ nlp_tutorials = ["seq2seq_translation_tutorial", "char_rnn_classification_tutorial", "char_rnn_generation_tutorial"]
+ if FILES_TO_RUN is None or any(x in FILES_TO_RUN for x in nlp_tutorials):
+ download_nlp_data()
+ if FILES_TO_RUN is None or "dcgan_faces_tutorial" in FILES_TO_RUN:
+ download_dcgan_data()
+ if FILES_TO_RUN is None or "fgsm_tutorial" in FILES_TO_RUN:
+ download_lenet_mnist()
+ if FILES_TO_RUN is None or "gpu_quantization_torchao_tutorial" in FILES_TO_RUN:
+ download_gpu_quantization_torchao()
+
+if __name__ == "__main__":
+ main()
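Because FILES_TO_RUN is matched by substring, assets for a single tutorial can be fetched by setting it in the environment. An illustrative invocation from the repo root:

    import os
    import subprocess

    # Download only the FGSM tutorial's model checkpoint.
    env = dict(os.environ, FILES_TO_RUN="fgsm_tutorial")
    subprocess.run(["python", ".jenkins/download_data.py"], env=env, check=True)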
diff --git a/.jenkins/get_docker_tag.py b/.jenkins/get_docker_tag.py
new file mode 100644
index 00000000000..21c4a8f7089
--- /dev/null
+++ b/.jenkins/get_docker_tag.py
@@ -0,0 +1,18 @@
+import requests
+
+REQUEST_HEADERS = {
+ "Accept": "application/vnd.github.v3+json",
+}
+
+if __name__ == "__main__":
+ url = "https://api.github.com/repos/pytorch/pytorch/contents/.ci"
+
+ response = requests.get(url, headers=REQUEST_HEADERS)
+ docker_sha = None
+ for finfo in response.json():
+ if finfo["name"] == "docker":
+ docker_sha = finfo["sha"]
+ break
+ if docker_sha is None:
+ raise RuntimeError("Can't find sha sum of docker folder")
+ print(docker_sha)
diff --git a/.jenkins/get_files_to_run.py b/.jenkins/get_files_to_run.py
new file mode 100644
index 00000000000..bdf4562a827
--- /dev/null
+++ b/.jenkins/get_files_to_run.py
@@ -0,0 +1,106 @@
+from typing import Any, Dict, List, Optional, Tuple
+import json
+import os
+from pathlib import Path
+from remove_runnable_code import remove_runnable_code
+
+
+# Calculate repo base dir
+REPO_BASE_DIR = Path(__file__).absolute().parent.parent
+
+
+def get_all_files() -> List[str]:
+ sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts]
+ return sorted([str(x) for x in sources])
+
+
+def read_metadata() -> Dict[str, Any]:
+ with (REPO_BASE_DIR / ".jenkins" / "metadata.json").open() as fp:
+ return json.load(fp)
+
+
+def calculate_shards(all_files: List[str], num_shards: int = 20) -> List[List[str]]:
+ sharded_files: List[Tuple[float, List[str]]] = [(0.0, []) for _ in range(num_shards)]
+ metadata = read_metadata()
+
+ def get_duration(file: str) -> int:
+ # tutorials not listed in the metadata.json file usually take
+ # <3min to run, so we'll default to 1min if it's not listed
+ return metadata.get(file, {}).get("duration", 60)
+
+ def get_needs_machine(file: str) -> Optional[str]:
+ return metadata.get(file, {}).get("needs", None)
+
+ def add_to_shard(i, filename):
+ shard_time, shard_jobs = sharded_files[i]
+ shard_jobs.append(filename)
+ sharded_files[i] = (
+ shard_time + get_duration(filename),
+ shard_jobs,
+ )
+
+ all_other_files = all_files.copy()
+ needs_multigpu = list(
+ filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
+ )
+ needs_a10g = list(
+ filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+ )
+ for filename in needs_multigpu:
+ # currently, the only job that has multigpu is the 0th worker,
+ # so we'll add all the jobs that need this machine to the 0th worker
+ add_to_shard(0, filename)
+ all_other_files.remove(filename)
+ for filename in needs_a10g:
+ # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
+ # so we'll add all the jobs that need this machine to the 1st worker
+ add_to_shard(1, filename)
+ all_other_files.remove(filename)
+ sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
+
+ for filename in sorted_files:
+ min_shard_index = min(range(1, num_shards), key=lambda i: sharded_files[i][0])
+ add_to_shard(min_shard_index, filename)
+ return [x[1] for x in sharded_files]
+
+
+def compute_files_to_keep(files_to_run: List[str]) -> List[str]:
+ metadata = read_metadata()
+ files_to_keep = list(files_to_run)
+ for file in files_to_run:
+ extra_files = metadata.get(file, {}).get("extra_files", [])
+ files_to_keep.extend(extra_files)
+ return files_to_keep
+
+
+def remove_other_files(all_files, files_to_keep) -> None:
+
+ for file in all_files:
+ if file not in files_to_keep:
+ remove_runnable_code(file, file)
+
+
+def parse_args() -> Any:
+ from argparse import ArgumentParser
+ parser = ArgumentParser("Select files to run")
+ parser.add_argument("--dry-run", action="store_true")
+ parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20")))
+ parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1")))
+ return parser.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+
+ all_files = get_all_files()
+ files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1]
+ if not args.dry_run:
+ remove_other_files(all_files, compute_files_to_keep(files_to_run))
+ stripped_file_names = [Path(x).stem for x in files_to_run]
+ print(" ".join(stripped_file_names))
+
+
+if __name__ == "__main__":
+ main()
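The balancing in calculate_shards is a greedy longest-processing-time heuristic: sort by duration, then always append to the least-loaded shard. On toy durations (numbers made up):

    durations = {"a.py": 300, "b.py": 200, "c.py": 100, "d.py": 90}
    shards, loads = [[], []], [0.0, 0.0]
    for f in sorted(durations, key=durations.get, reverse=True):
        i = loads.index(min(loads))  # least-loaded shard wins
        shards[i].append(f)
        loads[i] += durations[f]
    # shards -> [['a.py', 'd.py'], ['b.py', 'c.py']], loads -> [390.0, 300.0]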
diff --git a/.jenkins/get_sphinx_filenames.py b/.jenkins/get_sphinx_filenames.py
new file mode 100644
index 00000000000..b84267b48a3
--- /dev/null
+++ b/.jenkins/get_sphinx_filenames.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+from typing import List
+
+from get_files_to_run import get_all_files
+from validate_tutorials_built import NOT_RUN
+
+
+def get_files_for_sphinx() -> List[str]:
+ all_py_files = get_all_files()
+ return [x for x in all_py_files if all(y not in x for y in NOT_RUN)]
+
+
+SPHINX_SHOULD_RUN = "|".join(get_files_for_sphinx())
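SPHINX_SHOULD_RUN is a pipe-joined string, which is the shape sphinx-gallery accepts as a filename_pattern regex. A hedged sketch of how a conf.py might consume it (this wiring is assumed, not part of this diff; the GALLERY_PATTERN variable name is hypothetical):

    import os
    # sphinx-gallery setting in conf.py (directories assumed from this repo's layout)
    sphinx_gallery_conf = {
        "examples_dirs": ["beginner_source", "intermediate_source"],
        "gallery_dirs": ["beginner", "intermediate"],
        "filename_pattern": os.environ.get("GALLERY_PATTERN", ".+"),
    }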
diff --git a/.jenkins/insert_last_verified.py b/.jenkins/insert_last_verified.py
new file mode 100644
index 00000000000..b43ef8de8e8
--- /dev/null
+++ b/.jenkins/insert_last_verified.py
@@ -0,0 +1,160 @@
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+
+
+json_file_path = "tutorials-review-data.json"
+
+# paths to skip from the post-processing script
+paths_to_skip = [
+ "beginner/examples_autograd/two_layer_net_custom_function", # not present in the repo
+ "beginner/examples_nn/two_layer_net_module", # not present in the repo
+ "beginner/examples_tensor/two_layer_net_numpy", # not present in the repo
+ "beginner/examples_tensor/two_layer_net_tensor", # not present in the repo
+ "beginner/examples_autograd/two_layer_net_autograd", # not present in the repo
+ "beginner/examples_nn/two_layer_net_optim", # not present in the repo
+ "beginner/examples_nn/two_layer_net_nn", # not present in the repo
+ "intermediate/coding_ddpg", # not present in the repo - will delete the carryover
+]
+# Mapping of build directories to their tutorial source directories
+source_to_build_mapping = {
+ "beginner": "beginner_source",
+ "recipes": "recipes_source",
+ "distributed": "distributed",
+ "intermediate": "intermediate_source",
+ "prototype": "prototype_source",
+ "advanced": "advanced_source",
+ "": "", # root dir for index.rst
+}
+
+def get_git_log_date(file_path, git_log_args):
+ try:
+ result = subprocess.run(
+ ["git", "log"] + git_log_args + ["--", file_path],
+ capture_output=True,
+ text=True,
+ check=True,
+ )
+ if result.stdout:
+ date_str = result.stdout.splitlines()[0]
+ return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
+ except subprocess.CalledProcessError:
+ pass
+ raise ValueError(f"Could not find date for {file_path}")
+
+def get_creation_date(file_path):
+ return get_git_log_date(file_path, ["--diff-filter=A", "--format=%aD"]).strftime("%b %d, %Y")
+
+
+def get_last_updated_date(file_path):
+ return get_git_log_date(file_path, ["-1", "--format=%aD"]).strftime("%b %d, %Y")
+
+# Try to find the source file with the given base path and the extensions .rst and .py
+def find_source_file(base_path):
+ for ext in [".rst", ".py"]:
+ source_file_path = base_path + ext
+ if os.path.exists(source_file_path):
+ return source_file_path
+ return None
+
+
+# Function to process a JSON file and insert the "Last Verified" information into the HTML files
+def process_json_file(build_dir, json_file_path):
+ with open(json_file_path, "r", encoding="utf-8") as json_file:
+ json_data = json.load(json_file)
+
+ for entry in json_data:
+ path = entry["Path"]
+ last_verified = entry["Last Verified"]
+ status = entry.get("Status", "")
+ if path in paths_to_skip:
+ print(f"Skipping path: {path}")
+ continue
+ if status in ["needs update", "not verified"]:
+ formatted_last_verified = "Not Verified"
+ elif last_verified:
+ try:
+ last_verified_date = datetime.strptime(last_verified, "%Y-%m-%d")
+ formatted_last_verified = last_verified_date.strftime("%b %d, %Y")
+ except ValueError:
+ formatted_last_verified = "Unknown"
+ else:
+ formatted_last_verified = "Not Verified"
+ if status == "deprecated":
+ formatted_last_verified += "<br>Deprecated"
+
+ for build_subdir, source_subdir in source_to_build_mapping.items():
+ if path.startswith(build_subdir):
+ html_file_path = os.path.join(build_dir, path + ".html")
+ base_source_path = os.path.join(
+ source_subdir, path[len(build_subdir) + 1 :]
+ )
+ source_file_path = find_source_file(base_source_path)
+ break
+ else:
+ print(f"Warning: No mapping found for path {path}")
+ continue
+
+ if not os.path.exists(html_file_path):
+ print(
+ f"Warning: HTML file not found for path {html_file_path}."
+ "If this is a new tutorial, please add it to the audit JSON file and set the Verified status and todays's date."
+ )
+ continue
+
+ if not source_file_path:
+ print(f"Warning: Source file not found for path {base_source_path}.")
+ continue
+
+ created_on = get_creation_date(source_file_path)
+ last_updated = get_last_updated_date(source_file_path)
+
+ with open(html_file_path, "r", encoding="utf-8") as file:
+ soup = BeautifulSoup(file, "html.parser")
+ # Check if the tag with class "date-info-last-verified" already exists
+ existing_date_info = soup.find("p", {"class": "date-info-last-verified"})
+ if existing_date_info:
+ print(
+ f"Warning:
tag with class 'date-info-last-verified' already exists in {html_file_path}"
+ )
+ continue
+
+ h1_tag = soup.find("h1") # Find the h1 tag to insert the dates
+ if h1_tag:
+ date_info_tag = soup.new_tag("p", **{"class": "date-info-last-verified"})
+ date_info_tag["style"] = "color: #6c6c6d; font-size: small;"
+ # Add the "Created On", "Last Updated", and "Last Verified" information
+ date_info_tag.string = (
+ f"Created On: {created_on} | "
+ f"Last Updated: {last_updated} | "
+ f"Last Verified: {formatted_last_verified}"
+ )
+ # Insert the new tag after the <h1> tag
+ h1_tag.insert_after(date_info_tag)
+ # Save back to the HTML.
+ with open(html_file_path, "w", encoding="utf-8") as file:
+ file.write(str(soup))
+ else:
+ print(f"Warning: tag not found in {html_file_path}")
+
+
+def main():
+ if len(sys.argv) < 2:
+ print("Error: Build directory not provided. Exiting.")
+ exit(1)
+ build_dir = sys.argv[1]
+ print(f"Build directory: {build_dir}")
+ process_json_file(build_dir, json_file_path)
+ print(
+ "Finished processing JSON file. Please check the output for any warnings. "
+ "Pages like `nlp/index.html` are generated only during the full `make docs` "
+ "or `make html` build. Warnings about these files when you run `make html-noplot` "
+ "can be ignored."
+ )
+
+if __name__ == "__main__":
+ main()
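For a quick sanity check, the banner this script injects after a page's <h1> renders like this (dates illustrative):

    from bs4 import BeautifulSoup

    snippet = (
        '<p class="date-info-last-verified" '
        'style="color: #6c6c6d; font-size: small;">'
        'Created On: Jun 01, 2024 | Last Updated: Jun 10, 2024 | '
        'Last Verified: Not Verified</p>'
    )
    tag = BeautifulSoup(snippet, "html.parser").find(
        "p", {"class": "date-info-last-verified"})
    print(tag.get_text())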
diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json
new file mode 100644
index 00000000000..6e82d054b4e
--- /dev/null
+++ b/.jenkins/metadata.json
@@ -0,0 +1,76 @@
+{
+ "intermediate_source/ax_multiobjective_nas_tutorial.py": {
+ "extra_files": ["intermediate_source/mnist_train_nas.py"],
+ "duration": 2000
+ },
+ "beginner_source/dcgan_faces_tutorial.py": {
+ "duration": 2000
+ },
+ "intermediate_source/seq2seq_translation_tutorial.py": {
+ "duration": 1200
+ },
+ "beginner_source/hyperparameter_tuning_tutorial.py": {
+ "duration": 0
+ },
+ "advanced_source/dynamic_quantization_tutorial.py": {
+ "duration": 380
+ },
+ "beginner_source/chatbot_tutorial.py": {
+ "duration": 330
+ },
+ "intermediate_source/pipeline_tutorial.py": {
+ "duration": 320,
+ "needs": "linux.16xlarge.nvidia.gpu"
+ },
+ "beginner_source/blitz/data_parallel_tutorial.py": {
+ "needs": "linux.16xlarge.nvidia.gpu"
+ },
+ "intermediate_source/model_parallel_tutorial.py": {
+ "needs": "linux.16xlarge.nvidia.gpu"
+ },
+ "intermediate_source/torchrec_intro_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "recipes_source/torch_export_aoti_python.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "advanced_source/pendulum.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu",
+ "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run."
+ },
+ "intermediate_source/torchvision_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu",
+ "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py."
+ },
+ "advanced_source/coding_ddpg.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu",
+ "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py."
+ },
+ "recipes_source/compiling_optimizer_lr_scheduler.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "intermediate_source/torch_compile_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "intermediate_source/torch_export_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "intermediate_source/scaled_dot_product_attention_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "intermediate_source/transformer_building_blocks.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "recipes_source/regional_compilation.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "advanced_source/semi_structured_sparse.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
+ "prototype_source/gpu_quantization_torchao_tutorial.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ }
+}
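These fields are consumed by .jenkins/get_files_to_run.py: duration feeds shard balancing, needs pins a tutorial to a specific GPU runner, and extra_files protects helper files from being stripped. In code terms:

    import json

    meta = json.load(open(".jenkins/metadata.json"))
    entry = meta.get("beginner_source/dcgan_faces_tutorial.py", {})
    duration = entry.get("duration", 60)  # default: one minute
    machine = entry.get("needs")          # e.g. "linux.g5.4xlarge.nvidia.gpu"
    extras = entry.get("extra_files", [])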
diff --git a/.jenkins/post_process_notebooks.py b/.jenkins/post_process_notebooks.py
new file mode 100644
index 00000000000..d10eb5a1bcc
--- /dev/null
+++ b/.jenkins/post_process_notebooks.py
@@ -0,0 +1,97 @@
+import nbformat as nbf
+import os
+import re
+
+"""
+This post-processing script needs to run after the .ipynb files are
+generated. The script removes extraneous ```{=html} syntax from the
+admonitions and splits the cells that have video iframe into a
+separate code cell that can be run to load the video directly
+in the notebook. This script is included in build.sh.
+"""
+
+
+# Pattern to search ``` {.python .jupyter-code-cell}
+pattern = re.compile(r'(.*?)``` {\.python \.jupyter-code-cell}\n(.*?from IPython\.display import display, HTML.*?display\(HTML\(html_code\)\))\n```(.*)', re.DOTALL)
+
+
+def process_video_cell(notebook_path):
+ """
+ This function finds the "``` {.python .jupyter-code-cell}"
+ code blocks and slices them into a separate code cell
+ (instead of markdown), which allows the video to load
+ in the notebook. The rest of the content is placed
+ in a new markdown cell.
+ """
+ print(f'Processing file: {notebook_path}')
+ notebook = nbf.read(notebook_path, as_version=4)
+
+ # Iterate over markdown cells
+ for i, cell in enumerate(notebook.cells):
+ if cell.cell_type == 'markdown':
+ match = pattern.search(cell.source)
+ if match:
+ print(f'Match found in cell {i}: {match.group(0)[:100]}...')
+ # Extract the parts before and after the video code block
+ before_html_block = match.group(1)
+ code_block = match.group(2)
+
+ # Add a comment to run the cell to display the video
+ code_block = "# Run this cell to load the video\n" + code_block
+ # Create a new code cell
+ new_code_cell = nbf.v4.new_code_cell(source=code_block)
+
+ # Replace the original markdown cell with the part before the code block
+ cell.source = before_html_block
+
+ # Insert the new code cell after the current one
+ notebook.cells.insert(i+1, new_code_cell)
+ print(f'New code cell created with source: {new_code_cell.source}')
+
+ # If there is content after the HTML code block, create a new markdown cell
+ if len(match.group(3).strip()) > 0:
+ after_html_block = match.group(3)
+ new_markdown_cell = nbf.v4.new_markdown_cell(source=after_html_block)
+ # Create a new markdown cell and add the content after code block there
+ notebook.cells.insert(i+2, new_markdown_cell)
+
+ else:
+ # Remove ```{=html} from the code block
+ cell.source = remove_html_tag(cell.source)
+
+ nbf.write(notebook, notebook_path)
+
+
+def remove_html_tag(content):
+ """
+ Pandoc adds an extraneous ```{=html} ``` to raw HTML blocks which
+ prevents it from rendering correctly. This function removes
+ ```{=html} that we don't need.
+ """
+ content = re.sub(r'```{=html}\n(<div[^>]*>)\n```', r'\1', content)
+ content = re.sub(r'```{=html}\n(<\/div>)\n```', r'\1\n', content)
+ content = re.sub(r'```{=html}\n(<br[^>]*>)\n```', r'\1\n', content)
+ content = re.sub(r'```{=html}', '', content)
+ content = re.sub(r'\n```', '', content)
+ return content
+
+
+def walk_dir(downloads_dir):
+ """
+ Walk the dir and process all notebook files in
+ the _downloads directory and its subdirectories.
+ """
+ for root, dirs, files in os.walk(downloads_dir):
+ for filename in files:
+ if filename.endswith('.ipynb'):
+ process_video_cell(os.path.join(root, filename))
+
+
+def main():
+ downloads_dir = './docs/_downloads'
+ walk_dir(downloads_dir)
+
+
+if __name__ == "__main__":
+ main()
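To see what the regex at the top of this file captures, here is a toy markdown cell it matches (shape only; the fenced block is what process_admonitions in custom_pandoc_filter.py emits):

    example = (
        "Intro text\n"
        "``` {.python .jupyter-code-cell}\n"
        "from IPython.display import display, HTML\n"
        'html_code = """\n<iframe src="..."></iframe>\n"""\n'
        "display(HTML(html_code))\n"
        "```\n"
        "Trailing text"
    )
    m = pattern.search(example)
    # m.group(1) -> "Intro text\n"            (stays markdown)
    # m.group(2) -> the display(...) snippet  (becomes a new code cell)
    # m.group(3) -> "\nTrailing text"         (new markdown cell)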
diff --git a/.jenkins/remove_invisible_code_block_batch.sh b/.jenkins/remove_invisible_code_block_batch.sh
new file mode 100644
index 00000000000..5de9698de0f
--- /dev/null
+++ b/.jenkins/remove_invisible_code_block_batch.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+BUILDDIR=$1
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+
+# Remove INVISIBLE_CODE_BLOCK from .html/.rst/.rst.txt/.ipynb/.py files
+for filename in $(find $BUILDDIR/beginner $BUILDDIR/intermediate $BUILDDIR/advanced -name '*.html'); do
+ echo "Removing INVISIBLE_CODE_BLOCK from " $filename
+ python $DIR/remove_invisible_code_block_from_html.py $filename $filename
+done
+for filename in $(find $BUILDDIR/_sources/beginner $BUILDDIR/_sources/intermediate $BUILDDIR/_sources/advanced -name '*.rst.txt'); do
+ echo "Removing INVISIBLE_CODE_BLOCK from " $filename
+ python $DIR/remove_invisible_code_block_from_rst_txt.py $filename $filename
+done
+for filename in $(find $BUILDDIR/_downloads -name '*.ipynb'); do
+ echo "Removing INVISIBLE_CODE_BLOCK from " $filename
+ python $DIR/remove_invisible_code_block_from_ipynb.py $filename $filename
+done
+for filename in $(find $BUILDDIR/_downloads -name '*.py'); do
+ echo "Removing INVISIBLE_CODE_BLOCK from " $filename
+ python $DIR/remove_invisible_code_block_from_py.py $filename $filename
+done
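For reference, the convention these scripts strip: an invisible block is CI-only setup code fenced by a pair of sentinel comments in the tutorial source, e.g. (setup code hypothetical):

    # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%
    import torch
    torch.manual_seed(0)  # runs in CI, hidden from the rendered tutorial
    # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%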
diff --git a/.jenkins/remove_invisible_code_block_from_html.py b/.jenkins/remove_invisible_code_block_from_html.py
new file mode 100644
index 00000000000..827b9802d91
--- /dev/null
+++ b/.jenkins/remove_invisible_code_block_from_html.py
@@ -0,0 +1,17 @@
+import sys
+from bs4 import BeautifulSoup
+
+html_file_path = sys.argv[1]
+output_file_path = sys.argv[2]
+
+with open(html_file_path, 'r', encoding='utf-8') as html_file:
+ html = html_file.read()
+html_soup = BeautifulSoup(html, 'html.parser')
+
+elems = html_soup.find_all("div", {"class": "highlight-default"})
+for elem in elems:
+ if "%%%%%%INVISIBLE_CODE_BLOCK%%%%%%" in str(elem):
+ elem.decompose()
+
+with open(output_file_path, "w", encoding='utf-8') as output_file:
+ output_file.write(str(html_soup))
diff --git a/.jenkins/remove_invisible_code_block_from_ipynb.py b/.jenkins/remove_invisible_code_block_from_ipynb.py
new file mode 100644
index 00000000000..69913efb050
--- /dev/null
+++ b/.jenkins/remove_invisible_code_block_from_ipynb.py
@@ -0,0 +1,18 @@
+import sys
+
+ipynb_file_path = sys.argv[1]
+output_file_path = sys.argv[2]
+
+with open(ipynb_file_path, 'r', encoding='utf-8') as ipynb_file:
+ ipynb_lines = ipynb_file.readlines()
+
+ipynb_out_lines = []
+
+for line in ipynb_lines:
+ if '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' not in line:
+ ipynb_out_lines.append(line)
+
+with open(output_file_path, "w", encoding='utf-8') as output_file:
+ for line in ipynb_out_lines:
+ output_file.write(line)
diff --git a/.jenkins/remove_invisible_code_block_from_py.py b/.jenkins/remove_invisible_code_block_from_py.py
new file mode 100644
index 00000000000..d39e5f4bf98
--- /dev/null
+++ b/.jenkins/remove_invisible_code_block_from_py.py
@@ -0,0 +1,25 @@
+import sys
+
+py_file_path = sys.argv[1]
+output_file_path = sys.argv[2]
+
+with open(py_file_path, 'r', encoding='utf-8') as py_file:
+ py_lines = py_file.readlines()
+
+py_out_lines = []
+
+in_invisible_block = False
+for line in py_lines:
+ if not in_invisible_block:
+ if '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' in line:
+ in_invisible_block = True
+ else:
+ py_out_lines.append(line)
+ else:
+ if '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' in line:
+ in_invisible_block = False
+
+with open(output_file_path, "w", encoding='utf-8') as output_file:
+ for line in py_out_lines:
+ output_file.write(line)
diff --git a/.jenkins/remove_invisible_code_block_from_rst_txt.py b/.jenkins/remove_invisible_code_block_from_rst_txt.py
new file mode 100644
index 00000000000..e6eb648e754
--- /dev/null
+++ b/.jenkins/remove_invisible_code_block_from_rst_txt.py
@@ -0,0 +1,19 @@
+import sys
+
+rst_txt_file_path = sys.argv[1]
+output_file_path = sys.argv[2]
+
+with open(rst_txt_file_path, 'r', encoding='utf-8') as rst_txt_file:
+ rst_txt = rst_txt_file.read()
+
+splits = rst_txt.split('.. code-block:: default\n\n\n # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%\n')
+if len(splits) == 2:
+ code_before_invisible_block = splits[0]
+ code_after_invisible_block = splits[1].split(' # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%\n')[1]
+ rst_txt_out = code_before_invisible_block + code_after_invisible_block
+else:
+ rst_txt_out = rst_txt
+
+with open(output_file_path, "w", encoding='utf-8') as output_file:
+ output_file.write(rst_txt_out)
diff --git a/.jenkins/remove_runnable_code.py b/.jenkins/remove_runnable_code.py
new file mode 100644
index 00000000000..037017d8d76
--- /dev/null
+++ b/.jenkins/remove_runnable_code.py
@@ -0,0 +1,58 @@
+import sys
+
+STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE = "STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE"
+STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE = "STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE"
+STATE_NORMAL = "STATE_NORMAL"
+
+
+def remove_runnable_code(python_file_path, output_file_path):
+ with open(python_file_path, 'r', encoding='utf-8') as file:
+ lines = file.readlines()
+ ret_lines = []
+ state = STATE_NORMAL
+ for line in lines:
+ if state == STATE_NORMAL:
+ if line.startswith('#'):
+ ret_lines.append(line)
+ state = STATE_NORMAL
+ elif ((line.startswith('"""') or line.startswith('r"""')) and
+ line.rstrip().endswith('"""') and len(line.rstrip()) > 4):
+ ret_lines.append(line)
+ state = STATE_NORMAL
+ elif line.startswith('"""') or line.startswith('r"""'):
+ ret_lines.append(line)
+ state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
+ elif ((line.startswith("'''") or line.startswith("r'''")) and
+ line.rstrip().endswith("'''") and len(line.rstrip()) > 4):
+ ret_lines.append(line)
+ state = STATE_NORMAL
+ elif line.startswith("'''") or line.startswith("r'''"):
+ ret_lines.append(line)
+ state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE
+ else:
+ ret_lines.append("\n")
+ state = STATE_NORMAL
+ elif state == STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE:
+ if line.startswith('"""'):
+ ret_lines.append(line)
+ state = STATE_NORMAL
+ else:
+ ret_lines.append(line)
+ state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
+ elif state == STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE:
+ if line.startswith("'''"):
+ ret_lines.append(line)
+ state = STATE_NORMAL
+ else:
+ ret_lines.append(line)
+ state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE
+
+ ret_lines.append("\n# %%%%%%RUNNABLE_CODE_REMOVED%%%%%%")
+
+ with open(output_file_path, 'w', encoding='utf-8') as file:
+ for line in ret_lines:
+ file.write(line)
+
+
+if __name__ == "__main__":
+ remove_runnable_code(sys.argv[1], sys.argv[2])
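Net effect: comments and docstrings survive, every runnable line becomes a blank line, and the %%%%%%RUNNABLE_CODE_REMOVED%%%%%% sentinel is appended, which delete_html_file_with_runnable_code_removed.py later keys on. A usage sketch (paths illustrative):

    from remove_runnable_code import remove_runnable_code

    # Strip executable code from a tutorial copy, keeping its prose.
    remove_runnable_code("beginner_source/basics/intro.py",
                         "/tmp/intro_stripped.py")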
diff --git a/.jenkins/replace_tutorial_html_content.py b/.jenkins/replace_tutorial_html_content.py
new file mode 100644
index 00000000000..587464cd99e
--- /dev/null
+++ b/.jenkins/replace_tutorial_html_content.py
@@ -0,0 +1,24 @@
+import sys
+
+from bs4 import BeautifulSoup
+
+noplot_html_file_path = sys.argv[1]
+hasplot_html_file_path = sys.argv[2]
+output_html_file_path = sys.argv[3]
+with open(noplot_html_file_path, 'r', encoding='utf-8') as noplot_html_file:
+ noplot_html = noplot_html_file.read()
+with open(hasplot_html_file_path, 'r', encoding='utf-8') as hasplot_html_file:
+ hasplot_html = hasplot_html_file.read()
+
+noplot_html_soup = BeautifulSoup(noplot_html, 'html.parser')
+elems = noplot_html_soup.find_all("div", {"class": "sphx-glr-example-title"})
+if len(elems) == 0:
+ print("No match found, not replacing HTML content in "+noplot_html_file_path)
+elif len(elems) == 1:
+ print("Match found in "+noplot_html_file_path+". Replacing its content.")
+ elem = elems[0]
+ elem.replace_with(BeautifulSoup(hasplot_html, 'html.parser').find_all("div", {"class": "sphx-glr-example-title"})[0])
+ with open(output_html_file_path, "w", encoding='utf-8') as output_html_file:
+ output_html_file.write(str(noplot_html_soup))
+else:
+ raise Exception("Found more than one match in "+noplot_html_file_path+". Aborting.")
diff --git a/.jenkins/test_files_to_run.py b/.jenkins/test_files_to_run.py
new file mode 100644
index 00000000000..b4842a7dd75
--- /dev/null
+++ b/.jenkins/test_files_to_run.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+from get_files_to_run import get_all_files, calculate_shards
+from unittest import TestCase, main
+from functools import reduce
+
+class TestSharding(TestCase):
+ def test_no_sharding(self):
+ all_files = get_all_files()
+ sharded_files = calculate_shards(all_files, 1)
+ self.assertSetEqual(set(all_files), set(sharded_files[0]))
+
+ def test_sharding(self, num_shards=20):
+ all_files = get_all_files()
+ sharded_files = map(set, calculate_shards(all_files, num_shards))
+ self.assertSetEqual(set(all_files), reduce(lambda x,y: x.union(y), sharded_files, set()))
+
+
+
+if __name__ == "__main__":
+ main()
diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
new file mode 100644
index 00000000000..75dd51dd789
--- /dev/null
+++ b/.jenkins/validate_tutorials_built.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+from typing import List
+
+from bs4 import BeautifulSoup
+
+REPO_ROOT = Path(__file__).parent.parent
+
+# For every tutorial on this list, we should determine if it is ok to not run the tutorial (add a comment after
+# the file name to explain why, like intro.html), or fix the tutorial and remove it from this list.
+
+NOT_RUN = [
+ "beginner_source/basics/intro", # no code
+ "beginner_source/introyt/introyt_index", # no code
+ "beginner_source/onnx/intro_onnx",
+ "beginner_source/profiler",
+ "beginner_source/saving_loading_models",
+ "beginner_source/introyt/captumyt",
+ "beginner_source/examples_nn/polynomial_module",
+ "beginner_source/examples_nn/dynamic_net",
+ "beginner_source/examples_nn/polynomial_optim",
+ "beginner_source/examples_autograd/polynomial_autograd",
+ "beginner_source/examples_autograd/polynomial_custom_function",
+ "intermediate_source/dqn_with_rnn_tutorial", #not working on 2.8 release reenable after 3514
+ "intermediate_source/mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py
+ "intermediate_source/torch_compile_conv_bn_fuser",
+ "intermediate_source/_torch_export_nightly_tutorial", # does not work on release
+ "advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker
+ "unstable_source/gpu_direct_storage", # requires specific filesystem + GPUDirect Storage to be set up
+ "recipes_source/recipes/tensorboard_with_pytorch",
+ "recipes_source/recipes/what_is_state_dict",
+ "recipes_source/recipes/profiler_recipe",
+ "recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model",
+ "recipes_source/recipes/benchmark",
+ "recipes_source/recipes/tuning_guide",
+ "recipes_source/recipes/zeroing_out_gradients",
+ "recipes_source/recipes/defining_a_neural_network",
+ "recipes_source/recipes/timer_quick_start",
+ "recipes_source/recipes/amp_recipe",
+ "recipes_source/recipes/Captum_Recipe",
+ "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
+ "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
+ "intermediate_source/torchrec_intro_tutorial.py", #failing with 2.8 reenable after 3498
+]
+
+def tutorial_source_dirs() -> List[Path]:
+ return [
+ p.relative_to(REPO_ROOT).with_name(p.stem[:-7])
+ for p in REPO_ROOT.glob("*_source")
+ ]
+
+
+def main() -> None:
+ docs_dir = REPO_ROOT / "docs"
+ html_file_paths = []
+ for tutorial_source_dir in tutorial_source_dirs():
+ glob_path = f"{tutorial_source_dir}/**/*.html"
+ html_file_paths += docs_dir.glob(glob_path)
+
+ should_not_run = [f'{x.replace("_source", "")}.html' for x in NOT_RUN]
+ did_not_run = []
+ for html_file_path in html_file_paths:
+ with open(html_file_path, "r", encoding="utf-8") as html_file:
+ html = html_file.read()
+ html_soup = BeautifulSoup(html, "html.parser")
+ elems = html_soup.find_all("p", {"class": "sphx-glr-timing"})
+ for elem in elems:
+ if (
+ "Total running time of the script: ( 0 minutes 0.000 seconds)"
+ in elem.text
+ and not any(html_file_path.match(file) for file in should_not_run)
+ ):
+ did_not_run.append(html_file_path.as_posix())
+
+ if len(did_not_run) != 0:
+ raise RuntimeError(
+ "The following file(s) are not known bad but ran in 0.000 sec, meaning that any "
+ + "python code in this tutorial probably didn't run:\n{}".format(
+ "\n".join(did_not_run)
+ )
+ )
+
+
+if __name__ == "__main__":
+ main()
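The element this check keys on is the timing footer sphinx-gallery appends to every built page; in isolation (HTML illustrative):

    from bs4 import BeautifulSoup

    html = ('<p class="sphx-glr-timing">Total running time of the script: '
            '( 0 minutes 0.000 seconds)</p>')
    elems = BeautifulSoup(html, "html.parser").find_all(
        "p", {"class": "sphx-glr-timing"})
    # A 0.000-second timing on a page not covered by NOT_RUN fails the build.
    assert "0 minutes 0.000 seconds" in elems[0].text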
diff --git a/.lintrunner.toml b/.lintrunner.toml
new file mode 100644
index 00000000000..d3a1cbd9885
--- /dev/null
+++ b/.lintrunner.toml
@@ -0,0 +1,225 @@
+merge_base_with = "origin/main"
+
+# 4805a6ead6f1e7f32351056e2602be4e908f69b7 is from pytorch/pytorch main branch 2025-07-16
+
+[[linter]]
+code = 'SPACES'
+include_patterns = ['**']
+exclude_patterns = [
+ "_static/**/*", # Contains some files that should usually not be linted
+ # All files below this should be checked and either removed from the
+ # exclusion list by fixing them or have a reason to be excluded.
+ "advanced_source/coding_ddpg.py",
+ "advanced_source/cpp_autograd.rst",
+ "advanced_source/cpp_custom_ops.rst",
+ "advanced_source/generic_join.rst",
+ "advanced_source/neural_style_tutorial.py",
+ "advanced_source/pendulum.py",
+ "advanced_source/privateuseone.rst",
+ "advanced_source/semi_structured_sparse.py",
+ "advanced_source/sharding.rst",
+ "advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py",
+ "advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp",
+ "advanced_source/usb_semisup_learn.py",
+ "beginner_source/blitz/README.txt",
+ "beginner_source/blitz/neural_networks_tutorial.py",
+ "beginner_source/dcgan_faces_tutorial.py",
+ "beginner_source/ddp_series_fault_tolerance.rst",
+ "beginner_source/ddp_series_theory.rst",
+ "beginner_source/examples_nn/polynomial_module.py",
+ "beginner_source/examples_nn/polynomial_nn.py",
+ "beginner_source/hta_intro_tutorial.rst",
+ "beginner_source/hta_trace_diff_tutorial.rst",
+ "beginner_source/hybrid_frontend/README.txt",
+ "beginner_source/hybrid_frontend_tutorial.rst",
+ "beginner_source/hyperparameter_tuning_tutorial.py",
+ "beginner_source/introyt/README.txt",
+ "beginner_source/introyt/autogradyt_tutorial.py",
+ "beginner_source/introyt/captumyt.py",
+ "beginner_source/introyt/introyt1_tutorial.py",
+ "beginner_source/introyt/modelsyt_tutorial.py",
+ "beginner_source/introyt/tensorboardyt_tutorial.py",
+ "beginner_source/introyt/tensors_deeper_tutorial.py",
+ "beginner_source/introyt/trainingyt.py",
+ "beginner_source/knowledge_distillation_tutorial.py",
+ "beginner_source/nlp/sequence_models_tutorial.py",
+ "beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py",
+ "beginner_source/onnx/onnx_registry_tutorial.py",
+ "beginner_source/pytorch_with_examples.rst",
+ "beginner_source/saving_loading_models.py",
+ "beginner_source/template_tutorial.py",
+ "beginner_source/transfer_learning_tutorial.py",
+ "intermediate_source/TCPStore_libuv_backend.rst",
+ "intermediate_source/ax_multiobjective_nas_tutorial.py",
+ "intermediate_source/compiled_autograd_tutorial.rst",
+ "intermediate_source/ddp_series_multinode.rst",
+ "intermediate_source/dqn_with_rnn_tutorial.py",
+ "intermediate_source/fx_profiling_tutorial.py",
+ "intermediate_source/inductor_debug_cpu.py",
+ "intermediate_source/jacobians_hessians.py",
+ "intermediate_source/optimizer_step_in_backward_tutorial.py",
+ "intermediate_source/per_sample_grads.py",
+ "intermediate_source/pruning_tutorial.py",
+ "intermediate_source/reinforcement_q_learning.py",
+ "intermediate_source/tensorboard_profiler_tutorial.py",
+ "intermediate_source/torch_compile_tutorial.py",
+ "intermediate_source/transformer_building_blocks.py",
+ "unstable_source/README.md",
+ "unstable_source/README.txt",
+ "unstable_source/gpu_direct_storage.py",
+ "unstable_source/inductor_cpp_wrapper_tutorial.rst",
+ "unstable_source/inductor_windows.rst",
+ "unstable_source/maskedtensor_advanced_semantics.py",
+ "unstable_source/max_autotune_on_CPU_tutorial.rst",
+ "unstable_source/vmap_recipe.py",
+ "recipes_source/README.txt",
+ "recipes_source/compiling_optimizer.rst",
+ "recipes_source/compiling_optimizer_lr_scheduler.py",
+ "recipes_source/distributed_optim_torchscript.rst",
+ "recipes_source/foreach_map.py",
+ "recipes_source/profile_with_itt.rst",
+ "recipes_source/recipes/Captum_Recipe.py",
+ "recipes_source/recipes/benchmark.py",
+ "recipes_source/recipes/changing_default_device.py",
+ "recipes_source/recipes/defining_a_neural_network.py",
+ "recipes_source/recipes/tensorboard_with_pytorch.py",
+ "recipes_source/recipes/timer_quick_start.py",
+ "recipes_source/recipes/tuning_guide.py",
+ "recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py",
+ "recipes_source/recipes/what_is_state_dict.py",
+ "recipes_source/torch_compile_caching_tutorial.rst",
+ "recipes_source/torch_compile_torch_function_modes.py",
+ "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py",
+ "recipes_source/torch_compiler_set_stance_tutorial.py",
+ "recipes_source/torch_export_aoti_python.py",
+ "recipes_source/xeon_run_cpu.rst",
+ "advanced_source/cpp_export.rst",
+ "advanced_source/torch-script-parallelism.rst",
+ "advanced_source/torch_script_custom_classes.rst",
+ "advanced_source/torch_script_custom_ops.rst",
+ "recipes_source/torchscript_inference.rst",
+]
+init_command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--lint-name=grep_linter.py',
+ '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/grep_linter.py',
+ '--',
+ '--dry-run={{DRYRUN}}',
+]
+command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--run-lint',
+ '--lint-name=grep_linter.py',
+ '--',
+ '--pattern=[[:blank:]]$',
+ '--linter-name=SPACES',
+ '--error-name=trailing spaces',
+ '--replace-pattern=s/[[:blank:]]+$//',
+ """--error-description=\
+ This line has trailing spaces; please remove them.\
+ """,
+ '--',
+ '@{{PATHSFILE}}'
+]
+
+[[linter]]
+code = 'TABS'
+include_patterns = ['**']
+exclude_patterns = [
+ "_static/**/*", # Contains some files that should usually not be linted
+ ".lintrunner.toml", # Ironically needs to contain the tab character to find in other files
+ "Makefile", # Wants tabs for indentationo
+ # All files below this should be checked and either removed from the
+ # exclusion list by fixing them or have a reason to be excluded.
+ "advanced_source/README.txt",
+ "advanced_source/cpp_frontend.rst",
+ "advanced_source/torch_script_custom_ops.rst",
+ "beginner_source/README.txt",
+ "beginner_source/basics/tensorqs_tutorial.py",
+ "beginner_source/blitz/README.txt",
+ "beginner_source/blitz/tensor_tutorial.py",
+ "beginner_source/hybrid_frontend/README.txt",
+ "beginner_source/nlp/README.txt",
+ "beginner_source/nlp/pytorch_tutorial.py",
+ "intermediate_source/README.txt",
+ "intermediate_source/TP_tutorial.rst",
+ "intermediate_source/inductor_debug_cpu.py",
+ "unstable_source/README.txt",
+ "recipes_source/README.txt",
+ "recipes_source/recipes/README.txt",
+ "recipes_source/xeon_run_cpu.rst",
+]
+init_command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--lint-name=grep_linter.py',
+ '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/grep_linter.py',
+ '--',
+ '--dry-run={{DRYRUN}}',
+]
+command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--run-lint',
+ '--lint-name=grep_linter.py',
+ '--',
+ # @lint-ignore TXT2
+ '--pattern= ',
+ '--linter-name=TABS',
+ '--error-name=saw some tabs',
+ '--replace-pattern=s/\t/ /',
+ """--error-description=\
+ This line has tabs; please replace them with spaces.\
+ """,
+ '--',
+ '@{{PATHSFILE}}'
+]
+
+[[linter]]
+code = 'NEWLINE'
+include_patterns=['**']
+exclude_patterns=[
+ "_static/**/*", # Contains some files that should usually not be linted
+ # All files below this should be checked and either removed from the
+ # exclusion list by fixing them or have a reason to be excluded.
+ "advanced_source/extend_dispatcher.rst",
+ "advanced_source/neural_style_tutorial.py",
+ "advanced_source/sharding.rst",
+ "advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py",
+ "advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp",
+ "beginner_source/blitz/README.txt",
+ "beginner_source/dcgan_faces_tutorial.py",
+ "beginner_source/hta_trace_diff_tutorial.rst",
+ "beginner_source/hybrid_frontend/README.txt",
+ "beginner_source/nlp/pytorch_tutorial.py",
+ "beginner_source/template_tutorial.py",
+ "beginner_source/transfer_learning_tutorial.py",
+ "intermediate_source/custom_function_conv_bn_tutorial.py",
+ "intermediate_source/custom_function_double_backward_tutorial.rst",
+ "intermediate_source/forced_alignment_with_torchaudio_tutorial.rst",
+ "intermediate_source/nlp_from_scratch_index.rst",
+ "intermediate_source/pipeline_tutorial.rst",
+ "recipes_source/README.txt",
+ "recipes_source/script_optimized.rst",
+ "recipes_source/torch_compile_caching_configuration_tutorial.rst",
+ "recipes_source/torch_compile_caching_tutorial.rst",
+]
+init_command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--lint-name=newlines_linter.py',
+ '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/newlines_linter.py',
+ '--',
+ '--dry-run={{DRYRUN}}',
+]
+command = [
+ 'python3',
+ 'tools/linter/adapters/run_from_link.py',
+ '--run-lint',
+ '--lint-name=newlines_linter.py',
+ '--',
+ '@{{PATHSFILE}}',
+]
+is_formatter = true
diff --git a/.lycheeignore b/.lycheeignore
new file mode 100644
index 00000000000..fc1e3f1fa85
--- /dev/null
+++ b/.lycheeignore
@@ -0,0 +1,17 @@
+# Used for links to be ignored during the link check.
+# Add link to file along with comment as to why it should be ignored
+
+# Example link in some of the tutorials that should be ignored
+file:///f:/libtmp/some_file
+
+# Ignore links with "file:///" to catch any other example links
+file:\/\/\/.*
+
+# Ignore colab link in the setting of conf.py
+https://pytorch.org/tutorials/beginner/colab/n
+
+# Ignore local host link from intermediate_source/tensorboard_tutorial.rst
+http://localhost:6006
+
+# Ignore blog link from advanced_source/cpp_frontend.rst
+https://www.uber.com/blog/deep-neuroevolution/
diff --git a/.pyspelling.yml b/.pyspelling.yml
new file mode 100644
index 00000000000..bce797e6559
--- /dev/null
+++ b/.pyspelling.yml
@@ -0,0 +1,163 @@
+spellchecker: aspell
+matrix:
+- name: python
+ sources:
+ - "**/*.py"
+ dictionary:
+ wordlists:
+ - en-wordlist.txt
+ pipeline:
+ - pyspelling.filters.python:
+ group_comments: true
+ - pyspelling.filters.context:
+ context_visible_first: true
+ delimiters:
+ # Exclude figure rST tags
+ - open: '\.\.\s+(figure|literalinclude|math|image|grid)::'
+ close: '\n'
+ # Exclude roles:
+ - open: ':(?:(class|py:mod|mod|func|meth|obj)):`'
+ content: '[^`]*'
+ close: '`'
+ # Exclude reStructuredText hyperlinks
+ - open: '\s'
+ content: '\w*'
+ close: '_'
+ # Exclude raw directive
+ - open: '\.\. (raw)::.*$\n*'
+ close: '\n'
+ # Exclude Python coding directives
+ - open: '-\*- coding:'
+ close: '\n'
+ # Exclude Authors:
+ - open: 'Author(|s):'
+ close: '\n'
+ # Exclude .rst directives:
+ - open: ':math:`.*`'
+ close: ' '
+ # Ignore multiline content in codeblock
+ - open: '(?s)^::\n\n '
+ close: '^\n'
+ # Ignore reStructuredText block directives
+ - open: '\.\. (code-block|math)::.*$\n*'
+ content: '(?P<first>(^(?P<indent>[ ]+).*$\n))(?P<other>(^([ \t]+.*|[ \t]*)$\n)*)'
+ close: '(^(?![ \t]+.*$))'
+ # Ignore references like "[1] Author: Title"
+ - open: '\[\d\]'
+ close: '\n'
+ - pyspelling.filters.markdown:
+ - pyspelling.filters.html:
+ ignores:
+ - code
+ - pre
+ - pyspelling.filters.url:
+- name: reST
+ sources:
+ - "**/*.rst"
+ dictionary:
+ wordlists:
+ - en-wordlist.txt
+ pipeline:
+ - pyspelling.filters.text:
+ - pyspelling.filters.context:
+ context_visible_first: true
+ delimiters:
+ # Ignore text between inline back ticks
+ - open: '(div style|iframe).*'
+ close: '\n'
+ - open: '(- )?(?P<open>`+)'
+ close: '(?P=open)'
+ - open: ':figure:.*'
+ close: '\n'
+ # Ignore reStructuredText roles
+ - open: ':(?:(class|file|func|math|ref|octicon|meth|obj)):`'
+ content: '[^`]*'
+ close: '`'
+ - open: ':width:'
+ close: '$'
+ # Exclude raw directive
+ - open: '\.\. (raw|grid-item-card|galleryitem|includenodoc)::.*$\n*'
+ close: '\n'
+ # Ignore reStructuredText literals
+ - open: '::$'
+ close: '(?P<literal>(?:((?P<indent>[ ]+).*$)|(\n))+)'
+ # Ignore reStructuredText hyperlinks
+ - open: '\s'
+ content: '\w*'
+ close: '_'
+ # Ignore hyperlink in the DDP tutorials
+ - open: '`.*'
+ close: '`__'
+ # Ignore reStructuredText header ---
+ - open: '^'
+ content: '--*'
+ close: '$'
+ # Ignore reStructuredText header '''
+ - open: '^'
+ content: '''''*'
+ close: '$'
+ # Ignore reStructuredText block directives
+ - open: '\.\. (code-block|math|table)::.*$\n*'
+ content: '(?P<first>(^(?P<indent>[ ]+).*$\n))(?P<other>(^([ \t]+.*|[ \t]*)$\n)*)'
+ close: '(^(?![ \t]+.*$))'
+ - open: '\.\. (raw)::.*$\n*'
+ close: '^\s*$'
+ # Ignore reStructuredText substitution definitions
+ - open: '^\.\. \|[^|]+\|'
+ close: '$'
+ # Ignore reStructuredText substitutions
+ - open: '\|'
+ content: '[^|]*'
+ close: '\|_?'
+ # Ignore reStructuredText toctree
+ - open: '\.\.\s+toctree::'
+ close: '(?P<toctree>(?:((?P<indent>[ ]+).*$)|(\n))+)'
+ # Ignore directives
+ - open: '\.\.\s+(image|include|only)::'
+ close: '$'
+ - pyspelling.filters.url:
+- name: markdown
+ sources:
+ - '**/*.md'
+ dictionary:
+ wordlists:
+ - en-wordlist.txt
+ pipeline:
+ - pyspelling.filters.markdown:
+ markdown_extensions:
+ - markdown.extensions.extra:
+ - markdown.extensions.admonition:
+ - markdown.extensions.codehilite:
+ - markdown.extensions.meta:
+ - markdown.extensions.tables:
+ - markdown.extensions.toc:
+ - pyspelling.filters.html:
+ comments: false
+ ignores:
+ - code
+ - pre
+ - tt
+ - img
+ - a
+ - table
+ - thead
+ - tbody
+ - th
+ - tr
+ - td
+ - pyspelling.filters.context:
+ context_visible_first: true
+ delimiters:
+ # Ignore code blocks
+ - open: '```[a-z]*\n'
+ close: '```\n'
+ # Ignore inline code
+ - open: '`'
+ close: '`'
+ # Ignore links
+ - open: '\[([^]]*)\]'
+ close: '\([^)]*\)'
+ # Ignore HTML comments
+ - open: '<!--'
+ close: '-->'
+ - pyspelling.filters.url:
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000000..b91e23b17c0
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at . All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000000..9c52182e85a
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,367 @@
+# Contributing to tutorials
+
+We want to make contributing to this project as easy and transparent as
+possible. This file covers flagging issues, contributing updates to
+existing tutorials, and submitting new tutorials.
+
+NOTE: This guide assumes that you have your GitHub account properly
+configured, such as having an SSH key. If this is your first time
+contributing on GitHub, see the [GitHub
+Documentation](https://docs.github.com/en/get-started/quickstart/contributing-to-projects)
+on contributing to projects.
+
+
+# Issues
+
+We use [GitHub Issues](https://github.com/pytorch/tutorials/issues) to
+track public bugs. Please ensure your description is clear and includes
+sufficient instructions to reproduce the issue.
+
+
+# Security Bugs
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for
+the safe disclosure of security bugs. For these types of issues, please
+go through the process outlined on that page and do not file a public
+issue.
+
+# Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You only
+need to do this once and you will be able to work on all of Facebook's
+open source projects, not just PyTorch.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+
+# License
+
+By contributing to the tutorials, you agree that your contributions will
+be licensed as described in the `LICENSE` file in the root directory of
+this source tree.
+
+
+# Updates to existing tutorials
+
+We welcome your pull requests (PR) for updates and fixes.
+
+1. If you haven't already, complete the Contributor License Agreement
+ ("CLA").
+1. Fork the repo and create a branch from
+ [`main`](https://github.com/pytorch/tutorials).
+1. Test your code.
+1. Lint your code with a tool such as
+ [Pylint](https://pylint.pycqa.org/en/latest/).
+1. Submit your PR for review.
+
+
+# New Tutorials
+
+There are three types of tutorial content that we host on
+[`pytorch.org/tutorials`](https://github.com/pytorch/tutorials):
+
+* **Interactive tutorials** are authored and submitted as Python files.
+ The build system converts these into Jupyter notebooks and HTML. The
+ code in these tutorials is run every time they are built. To keep
+ these tutorials up and running, all their package dependencies need to
+ be resolved--which makes this type of tutorial more challenging to
+ maintain.
+
+* **Non-interactive tutorials** are authored and submitted as
+ reStructuredText files. The build system only converts them into HTML;
+ the code in them does not run on build. These tutorials are easier to
+ create and maintain but they do not provide an interactive experience.
+
+
+* **Recipes** are tutorials that provide bite-sized, actionable
+ examples of how to use specific features, which differentiates them
+ from full-length tutorials. Recipes can be interactive or
+ non-interactive.
+
+
+# Managing data that is used by your tutorial
+
+Your tutorial might depend on external data, such as pre-trained models,
+training data, or test data. We recommend storing this data in a
+commonly-used storage service, such as Amazon S3, and instructing your
+users to download the data at the beginning of your tutorial.
+
+To download your data add a function to the [download.py](https://github.com/pytorch/tutorials/blob/main/.jenkins/download_data.py)
+script. Follow the same pattern as other download functions.
+Please do not add download logic to `Makefile` as it will incur download overhead for all CI shards.
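+
+As an illustration, a new download function could follow a pattern like
+the minimal sketch below. The function name, URL, and destination are
+hypothetical--mirror the existing functions in `download_data.py` for the
+real conventions:
+
+```
+# Hypothetical sketch: follow the existing functions in
+# .jenkins/download_data.py rather than copying this verbatim.
+import urllib.request
+import zipfile
+from pathlib import Path
+
+DATA_DIR = Path("_data")
+
+
+def download_my_tutorial_data() -> None:
+    # Cache the archive in the shared _data directory so repeated
+    # builds do not re-download it.
+    DATA_DIR.mkdir(exist_ok=True)
+    archive = DATA_DIR / "my_tutorial_data.zip"
+    if not archive.exists():
+        urllib.request.urlretrieve(
+            "https://example.com/my_tutorial_data.zip", archive
+        )
+    # Extract into the tutorial's *_source/data folder, which is where
+    # CI expects tutorial data to live.
+    with zipfile.ZipFile(archive) as zf:
+        zf.extractall("beginner_source/data")
+```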
+
+# Python packages used by your tutorial
+
+If your tutorial has dependencies that are not already defined in
+`requirements.txt`, you should add them to that file. We recommend that
+you use only mature, well-supported packages in your tutorial. Packages
+that are obscure or not well-maintained may break as a result of, for
+example, updates to Python or PyTorch or other packages. If your
+tutorial fails to build in our Continuous Integration (CI) system, we
+might contact you in order to resolve the issue.
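+
+For example, rather than listing a bare package name, consider pinning a
+version in `requirements.txt`, such as `scikit-image==0.24.0` (package and
+version shown for illustration only), so that a later release cannot
+silently break the tutorial build.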
+
+
+# Deprecation of tutorials
+
+Under some circumstances, we might deprecate--and subsequently
+archive--a tutorial, removing it from the site. For example, if the
+tutorial breaks in our CI and we are not able to resolve the issue and
+are also not able to reach you, we might archive the tutorial. In these
+situations, resolving the breaking issue would normally be sufficient to
+make the tutorial available again.
+
+Another situation in which a tutorial might be deprecated is if it
+consistently receives low ratings--or low usage--by the community. Again,
+if this occurs, we will attempt to contact you.
+
+If we identify, or suspect, that your tutorial--or a package that your
+tutorial uses--has a **security or privacy** issue, we will immediately
+take the tutorial off the site.
+
+
+# Guidance for authoring tutorials and recipes
+
+In this section, we describe the process for creating tutorials and
+recipes for PyTorch.
+
+The first step is to decide which type of tutorial you want to create,
+taking into account how much support you can provide to keep the
+tutorial up-to-date. Ideally, your tutorial should demonstrate PyTorch
+functionality that is not duplicated in other tutorials.
+
+As described earlier, tutorials are resources that provide a holistic
+end-to-end understanding of how to use PyTorch. Recipes are scoped
+examples of how to use specific features; the goal of a recipe is to
+teach readers how to easily leverage features of PyTorch for their
+needs. Tutorials and recipes are always _actionable_. If the material is
+purely informative, consider adding it to the API docs instead.
+
+View our current [full-length tutorials](https://pytorch.org/tutorials/).
+
+To create actionable tutorials, start by identifying _learning
+objectives_, which are the end goals. Working backwards from these
+objectives will help to eliminate extraneous information.
+
+
+## Learning objectives ##
+
+To create the learning objectives, focus on what the user will
+implement. Set expectations by explicitly stating what the recipe will
+cover and what users will implement by the end. Here are some examples:
+
+- Create a custom dataset
+- Integrate a dataset using a library
+- Iterate over samples in the dataset
+- Apply a transform to the dataset
+
+
+## Voice and writing style ##
+
+Write for a global audience with an instructive and directive voice.
+
+- PyTorch has a global audience; use clear, easy to understand
+ language. Avoid idioms or other figures of speech.
+- To keep your instructions concise, use
+ [active voice](https://writing.wisc.edu/handbook/style/ccs_activevoice/) as much as possible.
+- For a short guide on the essentials of writing style,
+ [The Elements of Style](https://www.gutenberg.org/files/37134/37134-h/37134-h.htm)
+ is invaluable.
+- For extensive guidance on technical-writing style, the Google developer documentation
+ [style guide](https://developers.google.com/style)
+ is a great resource.
+- Think of the process as similar to creating a (really practical)
+ Medium post.
+
+
+## Structure ##
+
+We recommend that tutorials use the following structure, which guides users through the learning experience and provides appropriate context:
+
+1. Introduction
+1. Motivation: Why is this topic important?
+1. Link to relevant research papers or other background material.
+1. Learning objectives: Clearly state what the tutorial covers and what
+ users will implement by the end. For example: Provide a summary of
+ how the Integrated Gradients feature works and how to implement it
+ using Captum. The
+ [TensorBoard](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html)
+ tutorial provides a good example of how to specify learning
+ objectives.
+1. Setup and requirements. Call out any required setup or data
+ downloads.
+1. Step-by-step instructions. Ideally, the steps in the tutorial should
+ map back to the learning objectives. Consider adding comments in the
+ code that correspond to these steps and that help to clarify what
+ each section of the code is doing.
+1. Link to relevant [PyTorch
+ documentation](https://pytorch.org/docs/stable/index.html). This
+ helps readers have context for the tutorial source code and better
+ understand how and why it implements the technique you’re
+ demonstrating.
+1. Recap/Conclusion: Summarize the steps and concepts covered. Highlight
+ key takeaways.
+1. (Optional) Additional practice exercises for users to test their
+ knowledge. An example is [NLP From Scratch: Generating Names with a
+ Character-Level RNN tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html#exercises).
+1. Additional resources for more learning, such as documentation, other
+ tutorials, or relevant research.
+
+
+## Example Tutorials ##
+
+The following tutorials do a good job of demonstrating the ideas
+described in the preceding sections:
+
+- [Chatbot Tutorial](https://pytorch.org/tutorials/beginner/chatbot_tutorial.html)
+- [Tensorboard Tutorial](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html)
+- [NLP From Scratch: Generating Names with a Character-Level RNN
+Tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html)
+
+If you are creating a recipe, [this is a good
+example.](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes/what_is_state_dict.py)
+
+
+# Submission Process #
+
+Submit your tutorial as either a Python (`.py`) file or a
+reStructuredText (`.rst`) file. For Python files, the filename for your
+tutorial should end in "`_tutorial.py`"; for example,
+"`cool_pytorch_feature_tutorial.py`".
+
+Do not submit a Jupyter notebook. If you develop your tutorial in
+Jupyter, you'll need to convert it to Python. This
+[script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe)
+is one option for performing this conversion.
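+
+Alternatively, `jupyter nbconvert --to script your_tutorial.ipynb` performs
+a similar conversion, although you will likely still need to rework the
+output into the sphinx-gallery notebook style by hand.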
+
+For Python files, our CI system runs your code during each build.
+
+
+## Add Your Tutorial Code ##
+
+1. [Fork and
+ clone](https://docs.github.com/en/get-started/quickstart/contributing-to-projects)
+ the repo:
+ [https://github.com/pytorch/tutorials](https://github.com/pytorch/tutorials)
+
+1. Put the tutorial in one of the
+ [`beginner_source`](https://github.com/pytorch/tutorials/tree/main/beginner_source),
+ [`intermediate_source`](https://github.com/pytorch/tutorials/tree/main/intermediate_source),
+ [`advanced_source`](https://github.com/pytorch/tutorials/tree/main/advanced_source)
+ directories, based on the technical level of the content. For recipes, put the
+ recipe in
+ [`recipes_source`](https://github.com/pytorch/tutorials/tree/main/recipes_source).
+ In addition, for recipes, add the recipe in the recipes
+ [README.txt](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes/README.txt)
+ file.
+
+
+## Include Your Tutorial in `index.rst` ##
+
+In order for your tutorial to appear on the website and in tag
+search, you need to include it in `index.rst` or, for recipes, in
+`recipes_index.rst`.
+
+1. Open the relevant file
+ [`index.rst`](https://github.com/pytorch/tutorials/blob/main/index.rst)
+ or
+ [`recipes_index.rst`](https://github.com/pytorch/tutorials/blob/main/recipes_index.rst)
+1. Add a _card_ in reStructuredText format similar to the following:
+
+```
+.. customcarditem::
+ :header: Learn the Basics # Tutorial title
+ :card_description: A step-by-step guide to building a complete ML workflow with PyTorch. # Short description
+ :image: _static/img/thumbnails/cropped/60-min-blitz.png # Image that appears with the card
+ :link: beginner/basics/intro.html
+ :tags: Getting-Started
+```
+
+
+### Link ###
+
+The `link` should be the path to your tutorial in the source tree. For
+example, if the tutorial is in `beginner_source`, the link will be
+`beginner_source/rest/of/the/path.html`.
+
+
+### Tags ###
+
+Choose tags from the existing tags in the file. Reach out to a project
+maintainer to create a new tag. Tags must not contain any whitespace
+between words. Multi-word tags, such as “Getting
+Started”, should be hyphenated: Getting-Started. Otherwise, the tutorial
+might fail to build, and the cards will not display properly.
+
+
+### Image ###
+
+Add a thumbnail to the
+[`_static/img/thumbnails/cropped`](https://github.com/pytorch/tutorials/tree/main/_static/img/thumbnails/cropped)
+directory. Images that render the best are square--that is, they have
+equal `x` and `y` dimensions--and also have high resolution. [Here is an
+example](https://github.com/pytorch/tutorials/blob/main/_static/img/thumbnails/cropped/loading-data.PNG).
+
+## `toctree` ##
+
+1. Add your tutorial under the corresponding toctree (also in
+ `index.rst`). For example, if you are adding a tutorial that
+ demonstrates the PyTorch ability to process images or video, add it
+ under `Image and Video`:
+
+```
+.. toctree::
+ :maxdepth: 2
+ :includehidden:
+ :hidden:
+ :caption: Image and Video
+
+ intermediate/torchvision_tutorial
+ beginner/my-new-tutorial
+```
+
+
+## Test Your Tutorial Locally ##
+
+The following command builds an HTML version of the tutorial website.
+
+```
+make html-noplot
+```
+
+This command does not run your tutorial code. To build the tutorial in a
+way that executes the code, use `make docs`. However, unless you have a
+GPU-powered machine and a proper PyTorch CUDA setup, running this `make`
+command locally won't work. The continuous integration (CI) system will
+test your tutorial when you submit your PR.
+
+
+## Submit the PR ##
+
+NOTE: Please do not use [ghstack](https://github.com/ezyang/ghstack). We
+do not support ghstack in the [`pytorch/tutorials`](https://github.com/pytorch/tutorials) repo.
+
+Submit the changes as a PR to the main branch of
+[`pytorch/tutorials`](https://github.com/pytorch/tutorials).
+
+1. Add your changes, commit, and push:
+
+ ```
+ git add -A
+ git commit -m "Add "
+ git push --set-upstream mybranch
+ ```
+
+1. Submit the PR and tag individuals on the PyTorch project who can review
+ your PR.
+1. Address all feedback comments from your reviewers.
+1. Make sure all CI checks are passing.
+
+Once you submit your PR, you can see a generated Netlify preview of your
+build. You can see an example Netlify preview at the following URL:
+
+>
+
+
+## Do not merge the PR yourself ##
+
+Please **DO NOT MERGE** your own PR; the tutorial won't be published. In order to avoid potential build breaks with the tutorials site, only certain maintainers can authorize publishing.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000000..338dffbfe74
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017-2022, Pytorch contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
index ec18f71d1f4..7fcf1de6636 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,21 @@
# Minimal makefile for Sphinx documentation
#
+# Locale
+export LC_ALL=C
+
# You can set these variables from the command line.
-SPHINXOPTS =
+SPHINXOPTS ?=
SPHINXBUILD = sphinx-build
SPHINXPROJ = PyTorchTutorials
SOURCEDIR = .
BUILDDIR = _build
+DATADIR = _data
GH_PAGES_SOURCES = $(SOURCEDIR) Makefile
+ZIPOPTS ?= -qo
+TAROPTS ?=
+
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -18,37 +25,72 @@ help:
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -v
download:
- # transfer learning tutorial data
- wget -N https://download.pytorch.org/tutorial/hymenoptera_data.zip
- unzip -o hymenoptera_data.zip -d beginner_source
-
- # nlp tutorial data
- wget -N https://download.pytorch.org/tutorial/data.zip
- unzip -o data.zip -d intermediate_source
-
+ # IMPORTANT NOTE: Please make sure your dataset is downloaded to the *_source/data folder,
+ # otherwise CI might silently break.
+
+ # NOTE: For a new dataset, please use Step 1 and one of the Step 2 variants below;
+ # [something] should be replaced with the actual value.
+ # Step1. DOWNLOAD: wget -nv -N [SOURCE_FILE] -P $(DATADIR)
+ # Step2-1. UNZIP: unzip -o $(DATADIR)/[SOURCE_FILE] -d [*_source/data/]
+ # Step2-2. UNTAR: tar -xzf $(DATADIR)/[SOURCE_FILE] -C [*_source/data/]
+ # Step2-3. AS-IS: cp $(DATADIR)/[SOURCE_FILE] [*_source/data/]
+
+ # Run structured downloads first (this will also create the directories)
+ python3 .jenkins/download_data.py
+
# data loader tutorial
- wget -N https://download.pytorch.org/tutorial/faces.zip
- unzip -o faces.zip -d beginner_source
-
+ wget -nv -N https://download.pytorch.org/tutorial/faces.zip -P $(DATADIR)
+ unzip $(ZIPOPTS) $(DATADIR)/faces.zip -d beginner_source/data/
+
+ wget -nv -N https://download.pytorch.org/models/tutorials/4000_checkpoint.tar -P $(DATADIR)
+ cp $(DATADIR)/4000_checkpoint.tar beginner_source/data/
+
# neural style images
- rm -rf advanced_source/images/
- cp -r _static/img/neural-style/ advanced_source/images/
+ rm -rf advanced_source/data/images/ || true
+ mkdir -p advanced_source/data/images/
+ cp -r _static/img/neural-style/ advanced_source/data/images/
+
+ # Download dataset for beginner_source/hybrid_frontend/introduction_to_hybrid_frontend_tutorial.py
+ wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/iris.data -P $(DATADIR)
+ cp $(DATADIR)/iris.data beginner_source/data/
+
+ # Download dataset for beginner_source/chatbot_tutorial.py
+ wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus_v2.zip -P $(DATADIR)
+ unzip $(ZIPOPTS) $(DATADIR)/cornell_movie_dialogs_corpus_v2.zip -d beginner_source/data/
+
+ # Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py
+ wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR)
+ unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/
+download-last-reviewed-json:
+ @echo "Downloading tutorials-review-data.json..."
+ curl -o tutorials-review-data.json https://raw.githubusercontent.com/pytorch/tutorials/refs/heads/last-reviewed-data-json/tutorials-review-data.json
+ @echo "Finished downloading tutorials-review-data.json."
docs:
make download
+ make download-last-reviewed-json
make html
+ @python .jenkins/insert_last_verified.py $(BUILDDIR)/html
rm -rf docs
cp -r $(BUILDDIR)/html docs
touch docs/.nojekyll
+ rm -rf tutorials-review-data.json
html-noplot:
$(SPHINXBUILD) -D plot_gallery=0 -b html $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)/html"
+ # bash .jenkins/remove_invisible_code_block_batch.sh "$(BUILDDIR)/html"
@echo
+ make download-last-reviewed-json
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+ @echo "Running post-processing script to insert 'Last Verified' dates..."
+ @python .jenkins/insert_last_verified.py $(BUILDDIR)/html
+ rm -rf tutorials-review-data.json
clean-cache:
make clean
- rm -rf advanced beginner intermediate
+ rm -rf advanced beginner intermediate recipes
+ # remove additional python files downloaded for torchvision_tutorial.py
+ rm -rf intermediate_source/engine.py intermediate_source/utils.py intermediate_source/transforms.py intermediate_source/coco_eval.py intermediate_source/coco_utils.py
diff --git a/README.md b/README.md
index c80254b540a..3b858a3882b 100644
--- a/README.md
+++ b/README.md
@@ -3,24 +3,77 @@
All the tutorials are now presented as sphinx style documentation at:
-## [http://pytorch.org/tutorials](http://pytorch.org/tutorials)
+## [https://pytorch.org/tutorials](https://pytorch.org/tutorials)
+# Asking a question
+If you have a question about a tutorial, post in https://dev-discuss.pytorch.org/ rather than creating an issue in this repo. Your question will be answered much faster on the dev-discuss forum.
+
+# Submitting an issue
+
+You can submit the following types of issues:
+
+* Feature request - request a new tutorial to be added. Please explain why this tutorial is needed and how it demonstrates the value of PyTorch.
+* Bug report - report a failure or outdated information in an existing tutorial. When submitting a bug report, please run: `python3 -m torch.utils.collect_env` to get information about your environment and add the output to the bug report.
# Contributing
-We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.readthedocs.io/en/latest/tutorials/plot_notebook.html#sphx-glr-tutorials-plot-notebook-py) to create the tutorials. Syntax is very simple. In essence, you write a slightly well formatted python file and it shows up as documentation page.
+We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github.io/stable/tutorials/index.html) to create the tutorials. The syntax is very simple: in essence, you write a lightly formatted Python file and it shows up as an HTML page. In addition, a Jupyter notebook is autogenerated and available to run in Google Colab.
+
+Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)):
+
+NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md).
+
+1. Create a Python file. If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`.
+2. Put it in one of the `beginner_source`, `intermediate_source`, or `advanced_source` directories based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add them to `prototype_source`.
+3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst).
+4. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/main/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/main/recipes_index.rst) file.
+
+If you are starting off with a Jupyter notebook, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to Python file. After conversion and addition to the project, please make sure that section headings and other things are in logical order.
+
+## Building locally
+
+The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code:
+
+1. Install required dependencies by running: `pip install -r requirements.txt`.
+
+> Typically, you would work in either a `conda` environment or a `virtualenv`. If you want to use `virtualenv`, in the root of the repo, run: `virtualenv venv`, then `source venv/bin/activate`.
+
+- If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials, and build the documentation to the `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, see the next step.
+- You can skip the computationally intensive graph generation by running `make html-noplot` to build basic HTML documentation to `_build/html`. This way, you can quickly preview your tutorial.
+
+## Building a single tutorial
+
+You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example, to run only `neural_style_transfer_tutorial.py`, run:
+
+```
+GALLERY_PATTERN="neural_style_transfer_tutorial.py" make html
+```
+or
+
+```
+GALLERY_PATTERN="neural_style_transfer_tutorial.py" sphinx-build . _build
+```
+
+The `GALLERY_PATTERN` variable respects regular expressions.
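+For example, `GALLERY_PATTERN="neural_style_transfer_tutorial|tensorboard_tutorial" make html` should build both matching tutorials.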
+
+## Spell Check
+You can run `pyspelling` to check for spelling errors in the tutorials. Currently, `.rst` spell checking is limited to the `beginner/` directory. Contributions to enable spell checking in other directories are welcome!
+
+
+```
+pyspelling # full check (~3 mins)
+pyspelling -n python # Python files only
+pyspelling -n reST # reST files (only beginner/ dir currently included)
+```
+
-Here's how to create a new tutorial:
-1. Create a notebook styled python file. If you want it executed while inserted into documentation, save the file with suffix `tutorial` so that file name is `your_tutorial.py`.
-2. Put it in one of the beginner_source, intermediate_source, advanced_source based on the level.
-2. Include it in the right TOC tree at index.rst
-3. Create a thumbnail in the index file using a command like `.. galleryitem:: beginner/your_tutorial.py`. (This is a custom directive. See `custom_directives.py` for more info.)
+## About contributing to PyTorch Documentation and Tutorials
+* You can find information about contributing to PyTorch documentation in the
+PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
+* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
-In case you prefer to write your tutorial in jupyter, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to python file. After conversion and addition to the project, please make sure the sections headings etc are in logical order.
-## Building
+## License
-- Start with installing torch and torchvision. Install other requirements using `pip install -r requirements.txt`
-- Then you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. However, this will take about 30-60 min based on your system.
-- You can skip the execution by running `make html-noplot` to build html documentation to `_build/html`. This way, you can quickly preview your tutorial.
+PyTorch Tutorials is BSD licensed, as found in the LICENSE file.
diff --git a/_static/ajax-loader.gif b/_static/ajax-loader.gif
new file mode 100755
index 00000000000..61faf8cab23
Binary files /dev/null and b/_static/ajax-loader.gif differ
diff --git a/_static/basic.css b/_static/basic.css
new file mode 100755
index 00000000000..19ced1057ae
--- /dev/null
+++ b/_static/basic.css
@@ -0,0 +1,665 @@
+/*
+ * basic.css
+ * ~~~~~~~~~
+ *
+ * Sphinx stylesheet -- basic theme.
+ *
+ * :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+/* -- main layout ----------------------------------------------------------- */
+
+div.clearer {
+ clear: both;
+}
+
+/* -- relbar ---------------------------------------------------------------- */
+
+div.related {
+ width: 100%;
+ font-size: 90%;
+}
+
+div.related h3 {
+ display: none;
+}
+
+div.related ul {
+ margin: 0;
+ padding: 0 0 0 10px;
+ list-style: none;
+}
+
+div.related li {
+ display: inline;
+}
+
+div.related li.right {
+ float: right;
+ margin-right: 5px;
+}
+
+/* -- sidebar --------------------------------------------------------------- */
+
+div.sphinxsidebarwrapper {
+ padding: 10px 5px 0 10px;
+}
+
+div.sphinxsidebar {
+ float: left;
+ width: 230px;
+ margin-left: -100%;
+ font-size: 90%;
+ word-wrap: break-word;
+ overflow-wrap : break-word;
+}
+
+div.sphinxsidebar ul {
+ list-style: none;
+}
+
+div.sphinxsidebar ul ul,
+div.sphinxsidebar ul.want-points {
+ margin-left: 20px;
+ list-style: square;
+}
+
+div.sphinxsidebar ul ul {
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+div.sphinxsidebar form {
+ margin-top: 10px;
+}
+
+div.sphinxsidebar input {
+ border: 1px solid #98dbcc;
+ font-family: sans-serif;
+ font-size: 1em;
+}
+
+div.sphinxsidebar #searchbox input[type="text"] {
+ float: left;
+ width: 80%;
+ padding: 0.25em;
+ box-sizing: border-box;
+}
+
+div.sphinxsidebar #searchbox input[type="submit"] {
+ float: left;
+ width: 20%;
+ border-left: none;
+ padding: 0.25em;
+ box-sizing: border-box;
+}
+
+
+img {
+ border: 0;
+ max-width: 100%;
+}
+
+/* -- search page ----------------------------------------------------------- */
+
+ul.search {
+ margin: 10px 0 0 20px;
+ padding: 0;
+}
+
+ul.search li {
+ padding: 5px 0 5px 20px;
+ background-image: url(file.png);
+ background-repeat: no-repeat;
+ background-position: 0 7px;
+}
+
+ul.search li a {
+ font-weight: bold;
+}
+
+ul.search li div.context {
+ color: #888;
+ margin: 2px 0 0 30px;
+ text-align: left;
+}
+
+ul.keywordmatches li.goodmatch a {
+ font-weight: bold;
+}
+
+/* -- index page ------------------------------------------------------------ */
+
+table.contentstable {
+ width: 90%;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+table.contentstable p.biglink {
+ line-height: 150%;
+}
+
+a.biglink {
+ font-size: 1.3em;
+}
+
+span.linkdescr {
+ font-style: italic;
+ padding-top: 5px;
+ font-size: 90%;
+}
+
+/* -- general index --------------------------------------------------------- */
+
+table.indextable {
+ width: 100%;
+}
+
+table.indextable td {
+ text-align: left;
+ vertical-align: top;
+}
+
+table.indextable ul {
+ margin-top: 0;
+ margin-bottom: 0;
+ list-style-type: none;
+}
+
+table.indextable > tbody > tr > td > ul {
+ padding-left: 0em;
+}
+
+table.indextable tr.pcap {
+ height: 10px;
+}
+
+table.indextable tr.cap {
+ margin-top: 10px;
+ background-color: #f2f2f2;
+}
+
+img.toggler {
+ margin-right: 3px;
+ margin-top: 3px;
+ cursor: pointer;
+}
+
+div.modindex-jumpbox {
+ border-top: 1px solid #ddd;
+ border-bottom: 1px solid #ddd;
+ margin: 1em 0 1em 0;
+ padding: 0.4em;
+}
+
+div.genindex-jumpbox {
+ border-top: 1px solid #ddd;
+ border-bottom: 1px solid #ddd;
+ margin: 1em 0 1em 0;
+ padding: 0.4em;
+}
+
+/* -- domain module index --------------------------------------------------- */
+
+table.modindextable td {
+ padding: 2px;
+ border-collapse: collapse;
+}
+
+/* -- general body styles --------------------------------------------------- */
+
+div.body {
+ min-width: 450px;
+ max-width: 800px;
+}
+
+div.body p, div.body dd, div.body li, div.body blockquote {
+ -moz-hyphens: auto;
+ -ms-hyphens: auto;
+ -webkit-hyphens: auto;
+ hyphens: auto;
+}
+
+a.headerlink {
+ visibility: hidden;
+}
+
+h1:hover > a.headerlink,
+h2:hover > a.headerlink,
+h3:hover > a.headerlink,
+h4:hover > a.headerlink,
+h5:hover > a.headerlink,
+h6:hover > a.headerlink,
+dt:hover > a.headerlink,
+caption:hover > a.headerlink,
+p.caption:hover > a.headerlink,
+div.code-block-caption:hover > a.headerlink {
+ visibility: visible;
+}
+
+div.body p.caption {
+ text-align: inherit;
+}
+
+div.body td {
+ text-align: left;
+}
+
+.first {
+ margin-top: 0 !important;
+}
+
+p.rubric {
+ margin-top: 30px;
+ font-weight: bold;
+}
+
+img.align-left, .figure.align-left, object.align-left {
+ clear: left;
+ float: left;
+ margin-right: 1em;
+}
+
+img.align-right, .figure.align-right, object.align-right {
+ clear: right;
+ float: right;
+ margin-left: 1em;
+}
+
+img.align-center, .figure.align-center, object.align-center {
+ display: block;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+.align-left {
+ text-align: left;
+}
+
+.align-center {
+ text-align: center;
+}
+
+.align-right {
+ text-align: right;
+}
+
+/* -- sidebars -------------------------------------------------------------- */
+
+div.sidebar {
+ margin: 0 0 0.5em 1em;
+ border: 1px solid #ddb;
+ padding: 7px 7px 0 7px;
+ background-color: #ffe;
+ width: 40%;
+ float: right;
+}
+
+p.sidebar-title {
+ font-weight: bold;
+}
+
+/* -- topics ---------------------------------------------------------------- */
+
+div.topic {
+ border: 1px solid #ccc;
+ padding: 7px 7px 0 7px;
+ margin: 10px 0 10px 0;
+}
+
+p.topic-title {
+ font-size: 1.1em;
+ font-weight: bold;
+ margin-top: 10px;
+}
+
+/* -- admonitions ----------------------------------------------------------- */
+
+div.admonition {
+ margin-top: 10px;
+ margin-bottom: 10px;
+ padding: 7px;
+}
+
+div.admonition dt {
+ font-weight: bold;
+}
+
+div.admonition dl {
+ margin-bottom: 0;
+}
+
+p.admonition-title {
+ margin: 0px 10px 5px 0px;
+ font-weight: bold;
+}
+
+div.body p.centered {
+ text-align: center;
+ margin-top: 25px;
+}
+
+/* -- tables ---------------------------------------------------------------- */
+
+table.docutils {
+ border: 0;
+ border-collapse: collapse;
+}
+
+table.align-center {
+ margin-left: auto;
+ margin-right: auto;
+}
+
+table caption span.caption-number {
+ font-style: italic;
+}
+
+table caption span.caption-text {
+}
+
+table.docutils td, table.docutils th {
+ padding: 1px 8px 1px 5px;
+ border-top: 0;
+ border-left: 0;
+ border-right: 0;
+ border-bottom: 1px solid #aaa;
+}
+
+table.footnote td, table.footnote th {
+ border: 0 !important;
+}
+
+th {
+ text-align: left;
+ padding-right: 5px;
+}
+
+table.citation {
+ border-left: solid 1px gray;
+ margin-left: 1px;
+}
+
+table.citation td {
+ border-bottom: none;
+}
+
+/* -- figures --------------------------------------------------------------- */
+
+div.figure {
+ margin: 0.5em;
+ padding: 0.5em;
+}
+
+div.figure p.caption {
+ padding: 0.3em;
+}
+
+div.figure p.caption span.caption-number {
+ font-style: italic;
+}
+
+div.figure p.caption span.caption-text {
+}
+
+/* -- field list styles ----------------------------------------------------- */
+
+table.field-list td, table.field-list th {
+ border: 0 !important;
+}
+
+.field-list ul {
+ margin: 0;
+ padding-left: 1em;
+}
+
+.field-list p {
+ margin: 0;
+}
+
+.field-name {
+ -moz-hyphens: manual;
+ -ms-hyphens: manual;
+ -webkit-hyphens: manual;
+ hyphens: manual;
+}
+
+/* -- other body styles ----------------------------------------------------- */
+
+ol.arabic {
+ list-style: decimal;
+}
+
+ol.loweralpha {
+ list-style: lower-alpha;
+}
+
+ol.upperalpha {
+ list-style: upper-alpha;
+}
+
+ol.lowerroman {
+ list-style: lower-roman;
+}
+
+ol.upperroman {
+ list-style: upper-roman;
+}
+
+dl {
+ margin-bottom: 15px;
+}
+
+dd p {
+ margin-top: 0px;
+}
+
+dd ul, dd table {
+ margin-bottom: 10px;
+}
+
+dd {
+ margin-top: 3px;
+ margin-bottom: 10px;
+ margin-left: 30px;
+}
+
+dt:target, span.highlighted {
+ background-color: #fbe54e;
+}
+
+rect.highlighted {
+ fill: #fbe54e;
+}
+
+dl.glossary dt {
+ font-weight: bold;
+ font-size: 1.1em;
+}
+
+.optional {
+ font-size: 1.3em;
+}
+
+.sig-paren {
+ font-size: larger;
+}
+
+.versionmodified {
+ font-style: italic;
+}
+
+.system-message {
+ background-color: #fda;
+ padding: 5px;
+ border: 3px solid red;
+}
+
+.footnote:target {
+ background-color: #ffa;
+}
+
+.line-block {
+ display: block;
+ margin-top: 1em;
+ margin-bottom: 1em;
+}
+
+.line-block .line-block {
+ margin-top: 0;
+ margin-bottom: 0;
+ margin-left: 1.5em;
+}
+
+.guilabel, .menuselection {
+ font-family: sans-serif;
+}
+
+.accelerator {
+ text-decoration: underline;
+}
+
+.classifier {
+ font-style: oblique;
+}
+
+abbr, acronym {
+ border-bottom: dotted 1px;
+ cursor: help;
+}
+
+/* -- code displays --------------------------------------------------------- */
+
+pre {
+ overflow: auto;
+ overflow-y: hidden; /* fixes display issues on Chrome browsers */
+}
+
+span.pre {
+ -moz-hyphens: none;
+ -ms-hyphens: none;
+ -webkit-hyphens: none;
+ hyphens: none;
+}
+
+td.linenos pre {
+ padding: 5px 0px;
+ border: 0;
+ background-color: transparent;
+ color: #aaa;
+}
+
+table.highlighttable {
+ margin-left: 0.5em;
+}
+
+table.highlighttable td {
+ padding: 0 0.5em 0 0.5em;
+}
+
+div.code-block-caption {
+ padding: 2px 5px;
+ font-size: small;
+}
+
+div.code-block-caption code {
+ background-color: transparent;
+}
+
+div.code-block-caption + div > div.highlight > pre {
+ margin-top: 0;
+}
+
+div.code-block-caption span.caption-number {
+ padding: 0.1em 0.3em;
+ font-style: italic;
+}
+
+div.code-block-caption span.caption-text {
+}
+
+div.literal-block-wrapper {
+ padding: 1em 1em 0;
+}
+
+div.literal-block-wrapper div.highlight {
+ margin: 0;
+}
+
+code.descname {
+ background-color: transparent;
+ font-weight: bold;
+ font-size: 1.2em;
+}
+
+code.descclassname {
+ background-color: transparent;
+}
+
+code.xref, a code {
+ background-color: transparent;
+ font-weight: bold;
+}
+
+h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
+ background-color: transparent;
+}
+
+.viewcode-link {
+ float: right;
+}
+
+.viewcode-back {
+ float: right;
+ font-family: sans-serif;
+}
+
+div.viewcode-block:target {
+ margin: -1px -10px;
+ padding: 0 10px;
+}
+
+/* -- math display ---------------------------------------------------------- */
+
+img.math {
+ vertical-align: middle;
+}
+
+div.body div.math p {
+ text-align: center;
+}
+
+span.eqno {
+ float: right;
+}
+
+span.eqno a.headerlink {
+ position: relative;
+ left: 0px;
+ z-index: 1;
+}
+
+div.math:hover a.headerlink {
+ visibility: visible;
+}
+
+/* -- printout stylesheet --------------------------------------------------- */
+
+@media print {
+ div.document,
+ div.documentwrapper,
+ div.bodywrapper {
+ margin: 0 !important;
+ width: 100%;
+ }
+
+ div.sphinxsidebar,
+ div.related,
+ div.footer,
+ #top-link {
+ display: none;
+ }
+}
\ No newline at end of file
diff --git a/_static/broken_example.png b/_static/broken_example.png
new file mode 100755
index 00000000000..4fea24e7df4
Binary files /dev/null and b/_static/broken_example.png differ
diff --git a/_static/comment-bright.png b/_static/comment-bright.png
new file mode 100755
index 00000000000..15e27edb12a
Binary files /dev/null and b/_static/comment-bright.png differ
diff --git a/_static/comment-close.png b/_static/comment-close.png
new file mode 100755
index 00000000000..4d91bcf57de
Binary files /dev/null and b/_static/comment-close.png differ
diff --git a/_static/comment.png b/_static/comment.png
new file mode 100755
index 00000000000..dfbc0cbd512
Binary files /dev/null and b/_static/comment.png differ
diff --git a/_static/css/custom.css b/_static/css/custom.css
new file mode 100755
index 00000000000..a0882c1d4fc
--- /dev/null
+++ b/_static/css/custom.css
@@ -0,0 +1,97 @@
+/* sphinx-design styles for cards/tabs
+*/
+
+:root {
+ --sd-color-info: #ee4c2c;
+ --sd-color-primary: #6c6c6d;
+ --sd-color-primary-highlight: #f3f4f7;
+ --sd-color-card-border-hover: #ee4c2c;
+ --sd-color-card-border: #f3f4f7;
+ --sd-color-card-background: #fff;
+ --sd-color-card-text: inherit;
+ --sd-color-card-header: transparent;
+ --sd-color-card-footer: transparent;
+ --sd-color-tabs-label-active: hsla(231, 99%, 66%, 1);
+ --sd-color-tabs-label-hover: hsla(231, 99%, 66%, 1);
+ --sd-color-tabs-label-inactive: hsl(0, 0%, 66%);
+ --sd-color-tabs-underline-active: hsla(231, 99%, 66%, 1);
+ --sd-color-tabs-underline-hover: rgba(178, 206, 245, 0.62);
+ --sd-color-tabs-underline-inactive: transparent;
+ --sd-color-tabs-overline: rgb(222, 222, 222);
+ --sd-color-tabs-underline: rgb(222, 222, 222);
+}
+
+.sd-text-info {
+ color: #ee4c2c;
+}
+
+
+.sd-card {
+ position: relative;
+ background-color: #fff;
+ opacity: 1.0;
+ border-radius: 0px;
+ width: 30%;
+ border: none;
+ padding-bottom: 0px;
+}
+
+
+.sd-card-img {
+ opacity: 0.5;
+ width: 200px;
+ padding: 0px;
+}
+
+.sd-card-img:hover {
+ opacity: 1.0;
+ background-color: #f3f4f7;
+}
+
+
+.sd-card:after {
+ display: block;
+ opacity: 1;
+ content: '';
+ border-bottom: solid 1px #ee4c2c;
+ background-color: #fff;
+ transform: scaleX(0);
+ transition: transform .250s ease-in-out;
+ transform-origin: 0% 50%;
+}
+
+.sd-card:hover {
+ background-color: #fff;
+ opacity: 1;
+ border-top: 1px solid #f3f4f7;
+ border-left: 1px solid #f3f4f7;
+ border-right: 1px solid #f3f4f7;
+}
+
+.sd-card:hover:after {
+ transform: scaleX(1);
+}
+
+.card-prerequisites:hover {
+ transition: none;
+ border: none;
+}
+
+.card-prerequisites:hover:after {
+ transition: none;
+ transform: none;
+}
+
+.card-prerequisites:after {
+ display: block;
+ content: '';
+ border-bottom: none;
+ background-color: #fff;
+ transform: none;
+ transition: none;
+ transform-origin: none;
+}
+
+.pytorch-left-menu-search input[type=text] {
+ background-image: url("../images/search-icon.svg");
+}
diff --git a/_static/css/custom2.css b/_static/css/custom2.css
new file mode 100644
index 00000000000..a24ee796872
--- /dev/null
+++ b/_static/css/custom2.css
@@ -0,0 +1,112 @@
+/* Survey banner .css */
+
+.survey-banner {
+ margin-top: 10px;
+ background-color: #f3f4f7;
+ padding-top: 15px;
+ padding-left: 10px;
+ padding-bottom: 1px;
+}
+
+@media screen and (max-width: 600px) {
+ .survey-banner {
+ padding-top: 5px;
+ padding-left: 5px;
+ padding-bottom: -1px;
+ font-size: 12px;
+ margin-bottom: 5px;
+ }
+}
+
+/* Left nav for 2nd level nav */
+
+.pytorch-left-menu li.toctree-l2 {
+ padding-left: 10px;
+}
+
+.pytorch-left-menu li.toctree-l2.current > a {
+ color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l2.current a:link.reference.internal {
+ color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l1.current > a:before {
+ content: "";
+}
+
+/* search radio button*/
+
+input[type="radio"] {
+ accent-color: #ee4c2c;
+}
+
+.gsst_b {
+ display: none;
+}
+
+#gsc-i-id1 {
+ height: 1.5rem;
+ text-indent: 12px !important;
+ font-size: 1rem !important;
+ font-family: "FreightSansi";
+ background-image: url(../images/search-icon.svg) !important;
+ background-repeat: no-repeat !important;
+ background-size: 18px 18px !important;
+ background-position: 5px 0px !important;
+ padding-left: 20px !important;
+}
+
+#gsc-i-id1::placeholder {
+ font-family: 'FreightSans';
+ font-size: 1rem;
+ color: #262626;
+}
+
+.gsc-control-cse {
+ padding: 0 !important;
+ border-radius: 0px !important;
+ border: none !important;
+}
+
+.gsc-overflow-hidden {
+ overflow: visible !important;
+}
+
+#___gcse_0 {
+ height: 44px !important;
+ padding: 0 !important;
+}
+
+table.gsc-search-box td.gsc-input {
+ padding-right: 0 !important;
+}
+
+table.gsc-search-box td {
+ height: 44px;
+ margin-bottom: 0 !important;
+ padding-bottom: 0 !important;
+}
+
+.gsc-search-button-v2 {
+ display: none;
+}
+
+.gs_id50 {
+ width: 308px;
+}
+
+.gsib_a {
+ padding: 0px 8px 4px 9px !important;
+}
+
+.gsc-input-box {
+ border-radius: 0px !important;
+ border: none !important;
+}
+
+form.gsc-search-box {
+ margin-bottom: 0px;
+}
+
diff --git a/_static/css/pytorch_theme.css b/_static/css/pytorch_theme.css
old mode 100644
new mode 100755
index ea0dd7bf67e..153f4889c08
--- a/_static/css/pytorch_theme.css
+++ b/_static/css/pytorch_theme.css
@@ -94,6 +94,10 @@ a.icon.icon-home {
color: #D44D2C;
}
+.version{
+ color: #D44D2C !important;
+}
+
/* Default footer text is quite big */
footer {
font-size: 80%;
@@ -108,7 +112,7 @@ footer p {
}
/* For hidden headers that appear in TOC tree */
-/* see http://stackoverflow.com/a/32363545/3343043 */
+/* see https://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
display: none;
}
diff --git a/_static/down-pressed.png b/_static/down-pressed.png
new file mode 100755
index 00000000000..5756c8cad88
Binary files /dev/null and b/_static/down-pressed.png differ
diff --git a/_static/down.png b/_static/down.png
new file mode 100755
index 00000000000..1b3bdad2cef
Binary files /dev/null and b/_static/down.png differ
diff --git a/_static/file.png b/_static/file.png
new file mode 100755
index 00000000000..a858a410e4f
Binary files /dev/null and b/_static/file.png differ
diff --git a/_static/fonts/FreightSans/freight-sans-light.woff b/_static/fonts/FreightSans/freight-sans-light.woff
new file mode 100755
index 00000000000..e67ed72a13a
Binary files /dev/null and b/_static/fonts/FreightSans/freight-sans-light.woff differ
diff --git a/_static/fonts/FreightSans/freight-sans-light.woff2 b/_static/fonts/FreightSans/freight-sans-light.woff2
new file mode 100755
index 00000000000..5a6b1890aeb
Binary files /dev/null and b/_static/fonts/FreightSans/freight-sans-light.woff2 differ
diff --git a/_static/fonts/FreightSans/freight-sans-regular.woff b/_static/fonts/FreightSans/freight-sans-regular.woff
new file mode 100755
index 00000000000..7e39281921d
Binary files /dev/null and b/_static/fonts/FreightSans/freight-sans-regular.woff differ
diff --git a/_static/fonts/FreightSans/freight-sans-regular.woff2 b/_static/fonts/FreightSans/freight-sans-regular.woff2
new file mode 100755
index 00000000000..e77a2cc2c5e
Binary files /dev/null and b/_static/fonts/FreightSans/freight-sans-regular.woff2 differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff b/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff
new file mode 100755
index 00000000000..cf37a5c50bd
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff2 b/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff2
new file mode 100755
index 00000000000..955a6eab5bb
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Light.woff2 differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff b/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff
new file mode 100755
index 00000000000..fc65a679c22
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2 b/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2
new file mode 100755
index 00000000000..c352e40e34a
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2 differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff b/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff
new file mode 100755
index 00000000000..7d63d89f24b
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2 b/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2
new file mode 100755
index 00000000000..d0d7ded9079
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2 differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff b/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff
new file mode 100755
index 00000000000..1da7753cf28
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff differ
diff --git a/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2 b/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2
new file mode 100755
index 00000000000..79dffdb85f7
Binary files /dev/null and b/_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2 differ
diff --git a/_static/gallery.css b/_static/gallery.css
new file mode 100755
index 00000000000..07caf0e7090
--- /dev/null
+++ b/_static/gallery.css
@@ -0,0 +1,193 @@
+/*
+Sphinx-Gallery CSS overrides that keep the gallery rendering correctly
+in the default Sphinx themes. Tested with Sphinx 1.3.1 for all themes:
+default, alabaster, sphinxdoc, scrolls, agogo, traditional, nature,
+haiku, pyramid. Tested with the Read the Docs theme 0.1.7 */
+.sphx-glr-thumbcontainer {
+ background: #fff;
+ border: solid #fff 1px;
+ -moz-border-radius: 5px;
+ -webkit-border-radius: 5px;
+ border-radius: 5px;
+ box-shadow: none;
+ float: left;
+ margin: 5px;
+ min-height: 230px;
+ padding-top: 5px;
+ position: relative;
+}
+.sphx-glr-thumbcontainer:hover {
+ border: solid #b4ddfc 1px;
+ box-shadow: 0 0 15px rgba(142, 176, 202, 0.5);
+}
+.sphx-glr-thumbcontainer a.internal {
+ bottom: 0;
+ display: block;
+ left: 0;
+ padding: 150px 10px 0;
+ position: absolute;
+ right: 0;
+ top: 0;
+}
+/* The next rule keeps the Sphinx "traditional" theme from covering the
+whole thumbnail with its default link background color */
+.sphx-glr-thumbcontainer a.internal:hover {
+ background-color: transparent;
+}
+
+.sphx-glr-thumbcontainer p {
+ margin: 0 0 .1em 0;
+}
+.sphx-glr-thumbcontainer .figure {
+ margin: 10px;
+ width: 160px;
+}
+.sphx-glr-thumbcontainer img {
+ display: inline;
+ max-height: 160px;
+ width: 160px;
+}
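+/* Tooltip bubble and arrow, drawn via ::after/::before from the [tooltip] attribute on hover */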
+.sphx-glr-thumbcontainer[tooltip]:hover:after {
+ background: rgba(0, 0, 0, 0.8);
+ -webkit-border-radius: 5px;
+ -moz-border-radius: 5px;
+ border-radius: 5px;
+ color: #fff;
+ content: attr(tooltip);
+ left: 95%;
+ padding: 5px 15px;
+ position: absolute;
+ z-index: 98;
+ width: 220px;
+ bottom: 52%;
+}
+.sphx-glr-thumbcontainer[tooltip]:hover:before {
+ border: solid;
+ border-color: #333 transparent;
+ border-width: 18px 0 0 20px;
+ bottom: 58%;
+ content: '';
+ left: 85%;
+ position: absolute;
+ z-index: 99;
+}
+
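+/* Python traceback (pytb) blocks get a red error treatment */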
+.highlight-pytb pre {
+ background-color: #ffe4e4;
+ border: 1px solid #f66;
+ margin-top: 10px;
+ padding: 7px;
+}
+
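+/* Captured output of the executed example scripts */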
+.sphx-glr-script-out {
+ color: #888;
+ margin: 0;
+}
+.sphx-glr-script-out .highlight {
+ background-color: transparent;
+ margin-left: 2.5em;
+ margin-top: -1.4em;
+}
+.sphx-glr-script-out .highlight pre {
+ background-color: #fafae2;
+ border: 0;
+ max-height: 30em;
+ overflow: auto;
+ padding-left: 1ex;
+ margin: 0px;
+ word-break: break-word;
+}
+.sphx-glr-script-out + p {
+ margin-top: 1.8em;
+}
+blockquote.sphx-glr-script-out {
+ margin-left: 0pt;
+}
+
+div.sphx-glr-footer {
+ text-align: center;
+}
+
+div.binder-badge {
+ margin: 1em auto;
+ vertical-align: middle;
+}
+
+div.sphx-glr-download {
+ margin: 1em auto;
+ vertical-align: middle;
+}
+
+div.sphx-glr-download a {
+ background-color: #ffc;
+ background-image: linear-gradient(to bottom, #FFC, #d5d57e);
+ border-radius: 4px;
+ border: 1px solid #c2c22d;
+ color: #000;
+ display: inline-block;
+ font-weight: bold;
+ padding: 1ex;
+ text-align: center;
+}
+
+div.sphx-glr-download code.download {
+ display: inline-block;
+ white-space: normal;
+ word-break: normal;
+ overflow-wrap: break-word;
+ /* border and background are given by the enclosing 'a' */
+ border: none;
+ background: none;
+}
+
+div.sphx-glr-download a:hover {
+ box-shadow: inset 0 1px 0 rgba(255,255,255,.1), 0 1px 5px rgba(0,0,0,.25);
+ text-decoration: none;
+ background-image: none;
+ background-color: #d5d57e;
+}
+
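+/* Pad anchor targets so a :target example title is not hidden behind the fixed 50px page header */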
+.sphx-glr-example-title > :target::before {
+ display: block;
+ content: "";
+ margin-top: -50px;
+ height: 50px;
+ visibility: hidden;
+}
+
+ul.sphx-glr-horizontal {
+ list-style: none;
+ padding: 0;
+}
+ul.sphx-glr-horizontal li {
+ display: inline;
+}
+ul.sphx-glr-horizontal img {
+ height: auto !important;
+}
+
+.sphx-glr-single-img {
+ margin: auto;
+ display: block;
+ max-width: 100%;
+}
+
+.sphx-glr-multi-img {
+ max-width: 42%;
+ height: auto;
+}
+
+p.sphx-glr-signature a.reference.external {
+ -moz-border-radius: 5px;
+ -webkit-border-radius: 5px;
+ border-radius: 5px;
+ padding: 3px;
+ font-size: 75%;
+ text-align: right;
+ margin-left: auto;
+ display: table;
+}
diff --git a/_static/imagenet_class_index.json b/_static/imagenet_class_index.json
new file mode 100644
index 00000000000..5fe0dfefcd3
--- /dev/null
+++ b/_static/imagenet_class_index.json
@@ -0,0 +1 @@
+{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], 
"100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", 
"Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": 
["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", 
"barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", 
"chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", 
"hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": 
["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": 
["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": 
["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]}
\ No newline at end of file
diff --git a/_static/images/arrow-down-orange.svg b/_static/images/arrow-down-orange.svg
new file mode 100755
index 00000000000..e9d8e9ecf24
diff --git a/_static/images/arrow-right-with-tail.svg b/_static/images/arrow-right-with-tail.svg
new file mode 100755
index 00000000000..5843588fca6
diff --git a/_static/images/chevron-down-grey.svg b/_static/images/chevron-down-grey.svg
new file mode 100755
index 00000000000..82d6514f250
diff --git a/_static/images/chevron-right-orange.svg b/_static/images/chevron-right-orange.svg
new file mode 100755
index 00000000000..7033fc93bf4
diff --git a/_static/images/chevron-right-white.svg b/_static/images/chevron-right-white.svg
new file mode 100755
index 00000000000..dd9e77f2616
diff --git a/_static/images/home-footer-background.jpg b/_static/images/home-footer-background.jpg
new file mode 100755
index 00000000000..b307bb57f48
Binary files /dev/null and b/_static/images/home-footer-background.jpg differ
diff --git a/_static/images/icon-close.svg b/_static/images/icon-close.svg
new file mode 100755
index 00000000000..348964e79f7
diff --git a/_static/images/icon-menu-dots-dark.svg b/_static/images/icon-menu-dots-dark.svg
new file mode 100755
index 00000000000..fa2ad044b3f
diff --git a/_static/images/logo-dark.svg b/_static/images/logo-dark.svg
new file mode 100755
index 00000000000..9b4c1a56ac6
diff --git a/_static/images/logo-facebook-dark.svg b/_static/images/logo-facebook-dark.svg
new file mode 100755
index 00000000000..cff17915c4f
diff --git a/_static/images/logo-icon.svg b/_static/images/logo-icon.svg
new file mode 100755
index 00000000000..575f6823e47
diff --git a/_static/images/logo-twitter-dark.svg b/_static/images/logo-twitter-dark.svg
new file mode 100755
index 00000000000..1572570f88c
diff --git a/_static/images/logo.svg b/_static/images/logo.svg
new file mode 100755
index 00000000000..f8d44b98425
diff --git a/_static/images/microsoft-logo.svg b/_static/images/microsoft-logo.svg
new file mode 100644
index 00000000000..a1a7ce2d7a7
diff --git a/_static/images/view-page-source-icon.svg b/_static/images/view-page-source-icon.svg
new file mode 100755
index 00000000000..6f5bbe0748f
diff --git a/_static/img/8_workers.png b/_static/img/8_workers.png
new file mode 100644
index 00000000000..9a51182eb4b
Binary files /dev/null and b/_static/img/8_workers.png differ
diff --git a/_static/img/ExecuTorch-Logo-cropped.svg b/_static/img/ExecuTorch-Logo-cropped.svg
new file mode 100644
index 00000000000..9e0ef52fbd8
diff --git a/_static/img/ONNXLive.png b/_static/img/ONNXLive.png
new file mode 100755
index 00000000000..74fd4c891aa
Binary files /dev/null and b/_static/img/ONNXLive.png differ
diff --git a/_static/img/SRResNet.png b/_static/img/SRResNet.png
new file mode 100755
index 00000000000..8e3070fc65e
Binary files /dev/null and b/_static/img/SRResNet.png differ
diff --git a/_static/img/Variable.png b/_static/img/Variable.png
old mode 100644
new mode 100755
index 6576cc8704e..a4455d7500a
Binary files a/_static/img/Variable.png and b/_static/img/Variable.png differ
diff --git a/_static/img/audio_preprocessing_tutorial_waveform.png b/_static/img/audio_preprocessing_tutorial_waveform.png
new file mode 100644
index 00000000000..320b53c996d
Binary files /dev/null and b/_static/img/audio_preprocessing_tutorial_waveform.png differ
diff --git a/_static/img/autodiff.png b/_static/img/autodiff.png
new file mode 100644
index 00000000000..a102bdeee4e
Binary files /dev/null and b/_static/img/autodiff.png differ
diff --git a/_static/img/ax_logo.png b/_static/img/ax_logo.png
new file mode 100644
index 00000000000..ecb4302b524
Binary files /dev/null and b/_static/img/ax_logo.png differ
diff --git a/_static/img/ax_scheduler_illustration.png b/_static/img/ax_scheduler_illustration.png
new file mode 100644
index 00000000000..65e5a004a1b
Binary files /dev/null and b/_static/img/ax_scheduler_illustration.png differ
diff --git a/_static/img/basics/comp-graph.png b/_static/img/basics/comp-graph.png
new file mode 100644
index 00000000000..cfa6163d58a
Binary files /dev/null and b/_static/img/basics/comp-graph.png differ
diff --git a/_static/img/basics/fashion_mnist.png b/_static/img/basics/fashion_mnist.png
new file mode 100644
index 00000000000..213b1e1f17b
Binary files /dev/null and b/_static/img/basics/fashion_mnist.png differ
diff --git a/_static/img/basics/optimizationloops.png b/_static/img/basics/optimizationloops.png
new file mode 100644
index 00000000000..c43d83f2799
Binary files /dev/null and b/_static/img/basics/optimizationloops.png differ
diff --git a/_static/img/basics/typesdata.png b/_static/img/basics/typesdata.png
new file mode 100644
index 00000000000..5d0e0291eef
Binary files /dev/null and b/_static/img/basics/typesdata.png differ
diff --git a/_static/img/bert_mrpc.png b/_static/img/bert_mrpc.png
new file mode 100644
index 00000000000..fb0ff796f79
Binary files /dev/null and b/_static/img/bert_mrpc.png differ
diff --git a/_static/img/cartpole.gif b/_static/img/cartpole.gif
old mode 100644
new mode 100755
diff --git a/_static/img/cat.jpg b/_static/img/cat.jpg
new file mode 100755
index 00000000000..ccb575c02c3
Binary files /dev/null and b/_static/img/cat.jpg differ
diff --git a/_static/img/channels_last_memory_format.png b/_static/img/channels_last_memory_format.png
new file mode 100644
index 00000000000..d2b2922023f
Binary files /dev/null and b/_static/img/channels_last_memory_format.png differ
diff --git a/_static/img/char_rnn_generation.png b/_static/img/char_rnn_generation.png
old mode 100644
new mode 100755
diff --git a/_static/img/chat.png b/_static/img/chat.png
new file mode 100755
index 00000000000..02457c0697f
Binary files /dev/null and b/_static/img/chat.png differ
diff --git a/_static/img/chatbot/RNN-bidirectional.png b/_static/img/chatbot/RNN-bidirectional.png
new file mode 100755
index 00000000000..4dbec078f82
Binary files /dev/null and b/_static/img/chatbot/RNN-bidirectional.png differ
diff --git a/_static/img/chatbot/attn1.png b/_static/img/chatbot/attn1.png
new file mode 100755
index 00000000000..fe151c3cfb5
Binary files /dev/null and b/_static/img/chatbot/attn1.png differ
diff --git a/_static/img/chatbot/attn2.png b/_static/img/chatbot/attn2.png
new file mode 100755
index 00000000000..790f7430219
Binary files /dev/null and b/_static/img/chatbot/attn2.png differ
diff --git a/_static/img/chatbot/bot.png b/_static/img/chatbot/bot.png
new file mode 100755
index 00000000000..99502f546f0
Binary files /dev/null and b/_static/img/chatbot/bot.png differ
diff --git a/_static/img/chatbot/diff.png b/_static/img/chatbot/diff.png
new file mode 100644
index 00000000000..fc3cc56789b
Binary files /dev/null and b/_static/img/chatbot/diff.png differ
diff --git a/_static/img/chatbot/global_attn.png b/_static/img/chatbot/global_attn.png
new file mode 100755
index 00000000000..de18b75848a
Binary files /dev/null and b/_static/img/chatbot/global_attn.png differ
diff --git a/_static/img/chatbot/grad_clip.png b/_static/img/chatbot/grad_clip.png
new file mode 100755
index 00000000000..d9e5ca977b2
Binary files /dev/null and b/_static/img/chatbot/grad_clip.png differ
diff --git a/_static/img/chatbot/pytorch_workflow.png b/_static/img/chatbot/pytorch_workflow.png
new file mode 100644
index 00000000000..8a81f1995f0
Binary files /dev/null and b/_static/img/chatbot/pytorch_workflow.png differ
diff --git a/_static/img/chatbot/scores.png b/_static/img/chatbot/scores.png
new file mode 100755
index 00000000000..78f214f7508
Binary files /dev/null and b/_static/img/chatbot/scores.png differ
diff --git a/_static/img/chatbot/seq2seq_batches.png b/_static/img/chatbot/seq2seq_batches.png
new file mode 100755
index 00000000000..0543c445e2a
Binary files /dev/null and b/_static/img/chatbot/seq2seq_batches.png differ
diff --git a/_static/img/chatbot/seq2seq_ts.png b/_static/img/chatbot/seq2seq_ts.png
new file mode 100755
index 00000000000..f497297ac45
Binary files /dev/null and b/_static/img/chatbot/seq2seq_ts.png differ
diff --git a/_static/img/cifar10.png b/_static/img/cifar10.png
old mode 100644
new mode 100755
diff --git a/_static/img/classic_memory_format.png b/_static/img/classic_memory_format.png
new file mode 100644
index 00000000000..65cff010d88
Binary files /dev/null and b/_static/img/classic_memory_format.png differ
diff --git a/_static/img/compiled_autograd/call_hook_node.png b/_static/img/compiled_autograd/call_hook_node.png
new file mode 100644
index 00000000000..3e094cf6f73
Binary files /dev/null and b/_static/img/compiled_autograd/call_hook_node.png differ
diff --git a/_static/img/compiled_autograd/entire_verbose_log.png b/_static/img/compiled_autograd/entire_verbose_log.png
new file mode 100644
index 00000000000..4ce2b8538ee
Binary files /dev/null and b/_static/img/compiled_autograd/entire_verbose_log.png differ
diff --git a/_static/img/compiled_autograd/recompile_due_to_dynamic.png b/_static/img/compiled_autograd/recompile_due_to_dynamic.png
new file mode 100644
index 00000000000..41ae56acf2d
Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_dynamic.png differ
diff --git a/_static/img/compiled_autograd/recompile_due_to_node.png b/_static/img/compiled_autograd/recompile_due_to_node.png
new file mode 100644
index 00000000000..800a1784587
Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_node.png differ
diff --git a/_static/img/cpp-frontend/digits.png b/_static/img/cpp-frontend/digits.png
new file mode 100644
index 00000000000..0f7fa57543e
Binary files /dev/null and b/_static/img/cpp-frontend/digits.png differ
diff --git a/_static/img/cpp-pytorch.png b/_static/img/cpp-pytorch.png
new file mode 100644
index 00000000000..c19bcd8f964
Binary files /dev/null and b/_static/img/cpp-pytorch.png differ
diff --git a/_static/img/cpp_logo.png b/_static/img/cpp_logo.png
new file mode 100755
index 00000000000..432d4eb5a6e
Binary files /dev/null and b/_static/img/cpp_logo.png differ
diff --git a/_static/img/dag_autograd.png b/_static/img/dag_autograd.png
new file mode 100644
index 00000000000..cdc50fed625
Binary files /dev/null and b/_static/img/dag_autograd.png differ
diff --git a/_static/img/data_parallel.png b/_static/img/data_parallel.png
new file mode 100755
index 00000000000..5a729e9eae5
Binary files /dev/null and b/_static/img/data_parallel.png differ
diff --git a/_static/img/dcgan_generator.png b/_static/img/dcgan_generator.png
new file mode 100755
index 00000000000..9349577e4ea
Binary files /dev/null and b/_static/img/dcgan_generator.png differ
diff --git a/_static/img/deeplabv3_android.png b/_static/img/deeplabv3_android.png
new file mode 100644
index 00000000000..e0a451be8ef
Binary files /dev/null and b/_static/img/deeplabv3_android.png differ
diff --git a/_static/img/deeplabv3_android2.png b/_static/img/deeplabv3_android2.png
new file mode 100644
index 00000000000..0ae041479aa
Binary files /dev/null and b/_static/img/deeplabv3_android2.png differ
diff --git a/_static/img/deeplabv3_ios.png b/_static/img/deeplabv3_ios.png
new file mode 100644
index 00000000000..c901179e1ee
Binary files /dev/null and b/_static/img/deeplabv3_ios.png differ
diff --git a/_static/img/deeplabv3_ios2.png b/_static/img/deeplabv3_ios2.png
new file mode 100644
index 00000000000..3dc0073ca13
Binary files /dev/null and b/_static/img/deeplabv3_ios2.png differ
diff --git a/_static/img/distributed/DDP_memory.gif b/_static/img/distributed/DDP_memory.gif
new file mode 100644
index 00000000000..4049b9dba43
Binary files /dev/null and b/_static/img/distributed/DDP_memory.gif differ
diff --git a/_static/img/distributed/DistPyTorch.jpg b/_static/img/distributed/DistPyTorch.jpg
new file mode 100755
index 00000000000..65537a444f5
Binary files /dev/null and b/_static/img/distributed/DistPyTorch.jpg differ
diff --git a/_static/img/distributed/FSDP_autowrap.gif b/_static/img/distributed/FSDP_autowrap.gif
new file mode 100644
index 00000000000..d9e782d4c95
Binary files /dev/null and b/_static/img/distributed/FSDP_autowrap.gif differ
diff --git a/_static/img/distributed/FSDP_memory.gif b/_static/img/distributed/FSDP_memory.gif
new file mode 100644
index 00000000000..aece4e4b8a0
Binary files /dev/null and b/_static/img/distributed/FSDP_memory.gif differ
diff --git a/_static/img/distributed/all_gather.pdf b/_static/img/distributed/all_gather.pdf
new file mode 100755
index 00000000000..aadf4400432
Binary files /dev/null and b/_static/img/distributed/all_gather.pdf differ
diff --git a/_static/img/distributed/all_gather.png b/_static/img/distributed/all_gather.png
new file mode 100755
index 00000000000..6ccc912baf8
Binary files /dev/null and b/_static/img/distributed/all_gather.png differ
diff --git a/_static/img/distributed/all_reduce.pdf b/_static/img/distributed/all_reduce.pdf
new file mode 100755
index 00000000000..464574d8353
Binary files /dev/null and b/_static/img/distributed/all_reduce.pdf differ
diff --git a/_static/img/distributed/all_reduce.png b/_static/img/distributed/all_reduce.png
new file mode 100755
index 00000000000..789fb14739e
Binary files /dev/null and b/_static/img/distributed/all_reduce.png differ
diff --git a/_static/img/distributed/broadcast.png b/_static/img/distributed/broadcast.png
new file mode 100755
index 00000000000..3e615aeba5b
Binary files /dev/null and b/_static/img/distributed/broadcast.png differ
diff --git a/_static/img/distributed/device_mesh.png b/_static/img/distributed/device_mesh.png
new file mode 100644
index 00000000000..2ccabcc4824
Binary files /dev/null and b/_static/img/distributed/device_mesh.png differ
diff --git a/_static/img/distributed/distributed_checkpoint_generated_files.png b/_static/img/distributed/distributed_checkpoint_generated_files.png
new file mode 100644
index 00000000000..b32dddb7e62
Binary files /dev/null and b/_static/img/distributed/distributed_checkpoint_generated_files.png differ
diff --git a/_static/img/distributed/fsdp_implicit.png b/_static/img/distributed/fsdp_implicit.png
new file mode 100644
index 00000000000..85b19b7e72e
Binary files /dev/null and b/_static/img/distributed/fsdp_implicit.png differ
diff --git a/_static/img/distributed/fsdp_sharding.png b/_static/img/distributed/fsdp_sharding.png
new file mode 100755
index 00000000000..9dd1e3c111e
Binary files /dev/null and b/_static/img/distributed/fsdp_sharding.png differ
diff --git a/_static/img/distributed/fsdp_tp.png b/_static/img/distributed/fsdp_tp.png
new file mode 100644
index 00000000000..e419304ac7d
Binary files /dev/null and b/_static/img/distributed/fsdp_tp.png differ
diff --git a/_static/img/distributed/fsdp_workflow.png b/_static/img/distributed/fsdp_workflow.png
new file mode 100644
index 00000000000..1a8df0e44b6
Binary files /dev/null and b/_static/img/distributed/fsdp_workflow.png differ
diff --git a/_static/img/distributed/gather.png b/_static/img/distributed/gather.png
new file mode 100755
index 00000000000..195a1963830
Binary files /dev/null and b/_static/img/distributed/gather.png differ
diff --git a/_static/img/distributed/loss_parallel.png b/_static/img/distributed/loss_parallel.png
new file mode 100644
index 00000000000..b5cf9a499bc
Binary files /dev/null and b/_static/img/distributed/loss_parallel.png differ
diff --git a/_static/img/distributed/megatron_lm.png b/_static/img/distributed/megatron_lm.png
new file mode 100644
index 00000000000..38f7b06639f
Binary files /dev/null and b/_static/img/distributed/megatron_lm.png differ
diff --git a/_static/img/distributed/reduce.png b/_static/img/distributed/reduce.png
new file mode 100755
index 00000000000..a185ceb75c1
Binary files /dev/null and b/_static/img/distributed/reduce.png differ
diff --git a/_static/img/distributed/scatter.png b/_static/img/distributed/scatter.png
new file mode 100755
index 00000000000..140cd7c7d72
Binary files /dev/null and b/_static/img/distributed/scatter.png differ
diff --git a/_static/img/distributed/send_recv.png b/_static/img/distributed/send_recv.png
new file mode 100755
index 00000000000..d60e11e72fe
Binary files /dev/null and b/_static/img/distributed/send_recv.png differ
diff --git a/_static/img/distributed/send_recv_big.png b/_static/img/distributed/send_recv_big.png
new file mode 100755
index 00000000000..1ef9154c89e
Binary files /dev/null and b/_static/img/distributed/send_recv_big.png differ
diff --git a/_static/img/distributed/tcpstore_barrier_time.png b/_static/img/distributed/tcpstore_barrier_time.png
new file mode 100644
index 00000000000..5ece3a7471d
Binary files /dev/null and b/_static/img/distributed/tcpstore_barrier_time.png differ
diff --git a/_static/img/distributed/tcpstore_init_time.png b/_static/img/distributed/tcpstore_init_time.png
new file mode 100644
index 00000000000..df514b4dc48
Binary files /dev/null and b/_static/img/distributed/tcpstore_init_time.png differ
diff --git a/_static/img/dynamic_graph.gif b/_static/img/dynamic_graph.gif
old mode 100644
new mode 100755
diff --git a/_static/img/fgsm_panda_image.png b/_static/img/fgsm_panda_image.png
new file mode 100755
index 00000000000..c10a73b605f
Binary files /dev/null and b/_static/img/fgsm_panda_image.png differ
diff --git a/_static/img/flask.png b/_static/img/flask.png
new file mode 100644
index 00000000000..bad6738efdd
Binary files /dev/null and b/_static/img/flask.png differ
diff --git a/_static/img/half_cheetah.gif b/_static/img/half_cheetah.gif
new file mode 100644
index 00000000000..b61ff47d4e6
Binary files /dev/null and b/_static/img/half_cheetah.gif differ
diff --git a/_static/img/hta/comm_across_ranks.png b/_static/img/hta/comm_across_ranks.png
new file mode 100644
index 00000000000..2336de3bcbc
Binary files /dev/null and b/_static/img/hta/comm_across_ranks.png differ
diff --git a/_static/img/hta/counts_diff.png b/_static/img/hta/counts_diff.png
new file mode 100644
index 00000000000..34575c145de
Binary files /dev/null and b/_static/img/hta/counts_diff.png differ
diff --git a/_static/img/hta/cuda_kernel_launch.png b/_static/img/hta/cuda_kernel_launch.png
new file mode 100644
index 00000000000..e57c54a2fc5
Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch.png differ
diff --git a/_static/img/hta/cuda_kernel_launch_stats.png b/_static/img/hta/cuda_kernel_launch_stats.png
new file mode 100644
index 00000000000..33a160fc752
Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch_stats.png differ
diff --git a/_static/img/hta/duration_diff.png b/_static/img/hta/duration_diff.png
new file mode 100644
index 00000000000..050d491c872
Binary files /dev/null and b/_static/img/hta/duration_diff.png differ
diff --git a/_static/img/hta/idle_time.png b/_static/img/hta/idle_time.png
new file mode 100644
index 00000000000..782bfe9adb5
Binary files /dev/null and b/_static/img/hta/idle_time.png differ
diff --git a/_static/img/hta/idle_time_breakdown_percentage.png b/_static/img/hta/idle_time_breakdown_percentage.png
new file mode 100644
index 00000000000..3bab5946eab
Binary files /dev/null and b/_static/img/hta/idle_time_breakdown_percentage.png differ
diff --git a/_static/img/hta/idle_time_summary.png b/_static/img/hta/idle_time_summary.png
new file mode 100644
index 00000000000..101b696b534
Binary files /dev/null and b/_static/img/hta/idle_time_summary.png differ
diff --git a/_static/img/hta/kernel_metrics_df.png b/_static/img/hta/kernel_metrics_df.png
new file mode 100644
index 00000000000..53eefb58b0c
Binary files /dev/null and b/_static/img/hta/kernel_metrics_df.png differ
diff --git a/_static/img/hta/kernel_type_breakdown.png b/_static/img/hta/kernel_type_breakdown.png
new file mode 100644
index 00000000000..29a29cf89b2
Binary files /dev/null and b/_static/img/hta/kernel_type_breakdown.png differ
diff --git a/_static/img/hta/launch_delay_outliers.png b/_static/img/hta/launch_delay_outliers.png
new file mode 100644
index 00000000000..9bb455adea4
Binary files /dev/null and b/_static/img/hta/launch_delay_outliers.png differ
diff --git a/_static/img/hta/mem_bandwidth_queue_length.png b/_static/img/hta/mem_bandwidth_queue_length.png
new file mode 100644
index 00000000000..9df5383b5d9
Binary files /dev/null and b/_static/img/hta/mem_bandwidth_queue_length.png differ
diff --git a/_static/img/hta/overlap_df.png b/_static/img/hta/overlap_df.png
new file mode 100644
index 00000000000..ef164a28a12
Binary files /dev/null and b/_static/img/hta/overlap_df.png differ
diff --git a/_static/img/hta/overlap_plot.png b/_static/img/hta/overlap_plot.png
new file mode 100644
index 00000000000..acd449bc7ff
Binary files /dev/null and b/_static/img/hta/overlap_plot.png differ
diff --git a/_static/img/hta/pie_charts.png b/_static/img/hta/pie_charts.png
new file mode 100644
index 00000000000..fa9137109a6
Binary files /dev/null and b/_static/img/hta/pie_charts.png differ
diff --git a/_static/img/hta/queue_length_summary.png b/_static/img/hta/queue_length_summary.png
new file mode 100644
index 00000000000..639a03fb6d1
Binary files /dev/null and b/_static/img/hta/queue_length_summary.png differ
diff --git a/_static/img/hta/runtime_outliers.png b/_static/img/hta/runtime_outliers.png
new file mode 100644
index 00000000000..1e2dfff9006
Binary files /dev/null and b/_static/img/hta/runtime_outliers.png differ
diff --git a/_static/img/hta/short_gpu_kernels.png b/_static/img/hta/short_gpu_kernels.png
new file mode 100644
index 00000000000..ff382a3a7f0
Binary files /dev/null and b/_static/img/hta/short_gpu_kernels.png differ
diff --git a/_static/img/hta/temporal_breakdown_df.png b/_static/img/hta/temporal_breakdown_df.png
new file mode 100644
index 00000000000..dce1829d113
Binary files /dev/null and b/_static/img/hta/temporal_breakdown_df.png differ
diff --git a/_static/img/hta/temporal_breakdown_plot.png b/_static/img/hta/temporal_breakdown_plot.png
new file mode 100644
index 00000000000..9c5f45c1d35
Binary files /dev/null and b/_static/img/hta/temporal_breakdown_plot.png differ
diff --git a/_static/img/hybrid_frontend/220px-KnnClassification.png b/_static/img/hybrid_frontend/220px-KnnClassification.png
new file mode 100755
index 00000000000..fb9ee948f45
Binary files /dev/null and b/_static/img/hybrid_frontend/220px-KnnClassification.png differ
diff --git a/_static/img/hybrid_frontend/iris_pic.jpg b/_static/img/hybrid_frontend/iris_pic.jpg
new file mode 100755
index 00000000000..85bb9b49393
Binary files /dev/null and b/_static/img/hybrid_frontend/iris_pic.jpg differ
diff --git a/_static/img/hybrid_frontend/pytorch_workflow_small.jpg b/_static/img/hybrid_frontend/pytorch_workflow_small.jpg
new file mode 100755
index 00000000000..8b3d91b1df5
Binary files /dev/null and b/_static/img/hybrid_frontend/pytorch_workflow_small.jpg differ
diff --git a/_static/img/install_msvc.png b/_static/img/install_msvc.png
new file mode 100644
index 00000000000..fce73207a80
Binary files /dev/null and b/_static/img/install_msvc.png differ
diff --git a/_static/img/invpendulum.gif b/_static/img/invpendulum.gif
new file mode 100644
index 00000000000..3102c5b55cf
Binary files /dev/null and b/_static/img/invpendulum.gif differ
diff --git a/_static/img/itt_tutorial/vtune_config.png b/_static/img/itt_tutorial/vtune_config.png
new file mode 100755
index 00000000000..9f3c4605022
Binary files /dev/null and b/_static/img/itt_tutorial/vtune_config.png differ
diff --git a/_static/img/itt_tutorial/vtune_start.png b/_static/img/itt_tutorial/vtune_start.png
new file mode 100755
index 00000000000..9460df7c5f3
Binary files /dev/null and b/_static/img/itt_tutorial/vtune_start.png differ
diff --git a/_static/img/itt_tutorial/vtune_timeline.png b/_static/img/itt_tutorial/vtune_timeline.png
new file mode 100755
index 00000000000..1f1f018e3fa
Binary files /dev/null and b/_static/img/itt_tutorial/vtune_timeline.png differ
diff --git a/_static/img/itt_tutorial/vtune_xpu_config.png b/_static/img/itt_tutorial/vtune_xpu_config.png
new file mode 100644
index 00000000000..80dd1812d26
Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_config.png differ
diff --git a/_static/img/itt_tutorial/vtune_xpu_timeline.png b/_static/img/itt_tutorial/vtune_xpu_timeline.png
new file mode 100644
index 00000000000..43818cf105c
Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_timeline.png differ
diff --git a/_static/img/knowledge_distillation/ce_only.png b/_static/img/knowledge_distillation/ce_only.png
new file mode 100644
index 00000000000..a7503716575
Binary files /dev/null and b/_static/img/knowledge_distillation/ce_only.png differ
diff --git a/_static/img/knowledge_distillation/cosine_embedding_loss.png b/_static/img/knowledge_distillation/cosine_embedding_loss.png
new file mode 100644
index 00000000000..ebfd957a250
Binary files /dev/null and b/_static/img/knowledge_distillation/cosine_embedding_loss.png differ
diff --git a/_static/img/knowledge_distillation/cosine_loss_distillation.png b/_static/img/knowledge_distillation/cosine_loss_distillation.png
new file mode 100644
index 00000000000..81f241eb07f
Binary files /dev/null and b/_static/img/knowledge_distillation/cosine_loss_distillation.png differ
diff --git a/_static/img/knowledge_distillation/distillation_output_loss.png b/_static/img/knowledge_distillation/distillation_output_loss.png
new file mode 100644
index 00000000000..f86cbddbdfd
Binary files /dev/null and b/_static/img/knowledge_distillation/distillation_output_loss.png differ
diff --git a/_static/img/knowledge_distillation/fitnets_knowledge_distill.png b/_static/img/knowledge_distillation/fitnets_knowledge_distill.png
new file mode 100644
index 00000000000..407d9de89f6
Binary files /dev/null and b/_static/img/knowledge_distillation/fitnets_knowledge_distill.png differ
diff --git a/_static/img/landmarked_face2.png b/_static/img/landmarked_face2.png
old mode 100644
new mode 100755
diff --git a/_static/img/mario.gif b/_static/img/mario.gif
new file mode 100644
index 00000000000..95d8c0cb172
Binary files /dev/null and b/_static/img/mario.gif differ
diff --git a/_static/img/mario_env.png b/_static/img/mario_env.png
new file mode 100644
index 00000000000..b6fc09c3c8c
Binary files /dev/null and b/_static/img/mario_env.png differ
diff --git a/_static/img/memory_format_logo.png b/_static/img/memory_format_logo.png
new file mode 100644
index 00000000000..6d1043ed29a
Binary files /dev/null and b/_static/img/memory_format_logo.png differ
diff --git a/_static/img/mnist.png b/_static/img/mnist.png
index a85fc423984..53c876a89d5 100644
Binary files a/_static/img/mnist.png and b/_static/img/mnist.png differ
diff --git a/_static/img/model-parallel-images/mp_vs_rn.png b/_static/img/model-parallel-images/mp_vs_rn.png
new file mode 100644
index 00000000000..c56ec8adf51
Binary files /dev/null and b/_static/img/model-parallel-images/mp_vs_rn.png differ
diff --git a/_static/img/model-parallel-images/mp_vs_rn_vs_pp.png b/_static/img/model-parallel-images/mp_vs_rn_vs_pp.png
new file mode 100644
index 00000000000..a102c916771
Binary files /dev/null and b/_static/img/model-parallel-images/mp_vs_rn_vs_pp.png differ
diff --git a/_static/img/model-parallel-images/split_size_tradeoff.png b/_static/img/model-parallel-images/split_size_tradeoff.png
new file mode 100644
index 00000000000..f30eba44637
Binary files /dev/null and b/_static/img/model-parallel-images/split_size_tradeoff.png differ
diff --git a/_static/img/named_tensor.png b/_static/img/named_tensor.png
new file mode 100644
index 00000000000..2efceb9f516
Binary files /dev/null and b/_static/img/named_tensor.png differ
diff --git a/_static/img/neural-style/dancing.jpg b/_static/img/neural-style/dancing.jpg
old mode 100644
new mode 100755
diff --git a/_static/img/neural-style/neuralstyle.png b/_static/img/neural-style/neuralstyle.png
old mode 100644
new mode 100755
diff --git a/_static/img/neural-style/picasso.jpg b/_static/img/neural-style/picasso.jpg
old mode 100644
new mode 100755
diff --git a/_static/img/neural-style/sphx_glr_neural_style_tutorial_001.png b/_static/img/neural-style/sphx_glr_neural_style_tutorial_001.png
new file mode 100755
index 00000000000..cd86198719b
Binary files /dev/null and b/_static/img/neural-style/sphx_glr_neural_style_tutorial_001.png differ
diff --git a/_static/img/neural-style/sphx_glr_neural_style_tutorial_002.png b/_static/img/neural-style/sphx_glr_neural_style_tutorial_002.png
new file mode 100755
index 00000000000..75af8cb43e8
Binary files /dev/null and b/_static/img/neural-style/sphx_glr_neural_style_tutorial_002.png differ
diff --git a/_static/img/neural-style/sphx_glr_neural_style_tutorial_003.png b/_static/img/neural-style/sphx_glr_neural_style_tutorial_003.png
new file mode 100755
index 00000000000..70cf7fe2cf7
Binary files /dev/null and b/_static/img/neural-style/sphx_glr_neural_style_tutorial_003.png differ
diff --git a/_static/img/neural-style/sphx_glr_neural_style_tutorial_004.png b/_static/img/neural-style/sphx_glr_neural_style_tutorial_004.png
new file mode 100755
index 00000000000..2168b343341
Binary files /dev/null and b/_static/img/neural-style/sphx_glr_neural_style_tutorial_004.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_transformer_block.png b/_static/img/nvfuser_intro/nvfuser_transformer_block.png
new file mode 100755
index 00000000000..8dd88bbdf94
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_transformer_block.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_0.png b/_static/img/nvfuser_intro/nvfuser_tutorial_0.png
new file mode 100755
index 00000000000..d3448d192bc
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_0.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_1.png b/_static/img/nvfuser_intro/nvfuser_tutorial_1.png
new file mode 100755
index 00000000000..4752695fa91
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_1.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_2.png b/_static/img/nvfuser_intro/nvfuser_tutorial_2.png
new file mode 100755
index 00000000000..ec45793d67d
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_2.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_3.png b/_static/img/nvfuser_intro/nvfuser_tutorial_3.png
new file mode 100755
index 00000000000..be529d93259
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_3.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_4.png b/_static/img/nvfuser_intro/nvfuser_tutorial_4.png
new file mode 100755
index 00000000000..f2e7c3ff339
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_4.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_5.png b/_static/img/nvfuser_intro/nvfuser_tutorial_5.png
new file mode 100755
index 00000000000..efe43d73741
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_5.png differ
diff --git a/_static/img/nvfuser_intro/nvfuser_tutorial_6.png b/_static/img/nvfuser_intro/nvfuser_tutorial_6.png
new file mode 100755
index 00000000000..59a54a87f86
Binary files /dev/null and b/_static/img/nvfuser_intro/nvfuser_tutorial_6.png differ
diff --git a/_static/img/oneworker.png b/_static/img/oneworker.png
new file mode 100644
index 00000000000..255ec584834
Binary files /dev/null and b/_static/img/oneworker.png differ
diff --git a/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png b/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png
new file mode 100644
index 00000000000..6430e4943ff
Binary files /dev/null and b/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png differ
diff --git a/_static/img/onnx/netron_web_ui.png b/_static/img/onnx/netron_web_ui.png
new file mode 100755
index 00000000000..f88936eb824
Binary files /dev/null and b/_static/img/onnx/netron_web_ui.png differ
diff --git a/_static/img/optim_step_in_bwd/snapshot.jpg b/_static/img/optim_step_in_bwd/snapshot.jpg
new file mode 100644
index 00000000000..50be55e7b9a
Binary files /dev/null and b/_static/img/optim_step_in_bwd/snapshot.jpg differ
diff --git a/_static/img/optim_step_in_bwd/snapshot_opt_in_bwd.jpg b/_static/img/optim_step_in_bwd/snapshot_opt_in_bwd.jpg
new file mode 100644
index 00000000000..65d53d21c38
Binary files /dev/null and b/_static/img/optim_step_in_bwd/snapshot_opt_in_bwd.jpg differ
diff --git a/_static/img/panda.png b/_static/img/panda.png
new file mode 100755
index 00000000000..dd717fec882
Binary files /dev/null and b/_static/img/panda.png differ
diff --git a/_static/img/pendulum.gif b/_static/img/pendulum.gif
new file mode 100644
index 00000000000..a7adf181fc8
Binary files /dev/null and b/_static/img/pendulum.gif differ
diff --git a/_static/img/per_channel_quant.png b/_static/img/per_channel_quant.png
new file mode 100644
index 00000000000..e28810aca3e
Binary files /dev/null and b/_static/img/per_channel_quant.png differ
diff --git a/_static/img/per_tensor_quant.png b/_static/img/per_tensor_quant.png
new file mode 100644
index 00000000000..183bab6fa3b
Binary files /dev/null and b/_static/img/per_tensor_quant.png differ
diff --git a/_static/img/perf_viz.png b/_static/img/perf_viz.png
new file mode 100644
index 00000000000..85608557bcb
Binary files /dev/null and b/_static/img/perf_viz.png differ
diff --git a/_static/img/pinmem/pinmem.png b/_static/img/pinmem/pinmem.png
new file mode 100644
index 00000000000..9d84e9d229d
Binary files /dev/null and b/_static/img/pinmem/pinmem.png differ
diff --git a/_static/img/pinmem/trace_streamed0_pinned0.png b/_static/img/pinmem/trace_streamed0_pinned0.png
new file mode 100644
index 00000000000..dedac997b0b
Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned0.png differ
diff --git a/_static/img/pinmem/trace_streamed0_pinned1.png b/_static/img/pinmem/trace_streamed0_pinned1.png
new file mode 100644
index 00000000000..2d5ff462e1a
Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned1.png differ
diff --git a/_static/img/pinmem/trace_streamed1_pinned0.png b/_static/img/pinmem/trace_streamed1_pinned0.png
new file mode 100644
index 00000000000..130182a1978
Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned0.png differ
diff --git a/_static/img/pinmem/trace_streamed1_pinned1.png b/_static/img/pinmem/trace_streamed1_pinned1.png
new file mode 100644
index 00000000000..c596fcdb691
Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned1.png differ
diff --git a/_static/img/profiler_callstack.png b/_static/img/profiler_callstack.png
new file mode 100644
index 00000000000..835673ba63c
Binary files /dev/null and b/_static/img/profiler_callstack.png differ
diff --git a/_static/img/profiler_distributed_view.png b/_static/img/profiler_distributed_view.png
new file mode 100644
index 00000000000..2b0d5565131
Binary files /dev/null and b/_static/img/profiler_distributed_view.png differ
diff --git a/_static/img/profiler_kernel_view.png b/_static/img/profiler_kernel_view.png
new file mode 100644
index 00000000000..cfe01b83a0d
Binary files /dev/null and b/_static/img/profiler_kernel_view.png differ
diff --git a/_static/img/profiler_memory_curve_selecting.png b/_static/img/profiler_memory_curve_selecting.png
new file mode 100644
index 00000000000..b5dc0c10e9b
Binary files /dev/null and b/_static/img/profiler_memory_curve_selecting.png differ
diff --git a/_static/img/profiler_memory_curve_single.png b/_static/img/profiler_memory_curve_single.png
new file mode 100644
index 00000000000..c12d480ac40
Binary files /dev/null and b/_static/img/profiler_memory_curve_single.png differ
diff --git a/_static/img/profiler_memory_view.png b/_static/img/profiler_memory_view.png
new file mode 100644
index 00000000000..4839505ab8c
Binary files /dev/null and b/_static/img/profiler_memory_view.png differ
diff --git a/_static/img/profiler_operator_view.png b/_static/img/profiler_operator_view.png
new file mode 100644
index 00000000000..e3e60b03025
Binary files /dev/null and b/_static/img/profiler_operator_view.png differ
diff --git a/_static/img/profiler_overview1.png b/_static/img/profiler_overview1.png
new file mode 100644
index 00000000000..01eef8fda68
Binary files /dev/null and b/_static/img/profiler_overview1.png differ
diff --git a/_static/img/profiler_overview2.png b/_static/img/profiler_overview2.png
new file mode 100644
index 00000000000..cc7826b352a
Binary files /dev/null and b/_static/img/profiler_overview2.png differ
diff --git a/_static/img/profiler_rocm_chrome_trace_view.png b/_static/img/profiler_rocm_chrome_trace_view.png
new file mode 100644
index 00000000000..cff7ba98c8a
Binary files /dev/null and b/_static/img/profiler_rocm_chrome_trace_view.png differ
diff --git a/_static/img/profiler_rocm_tensorboard_operartor_view.png b/_static/img/profiler_rocm_tensorboard_operartor_view.png
new file mode 100644
index 00000000000..27effb91e7c
Binary files /dev/null and b/_static/img/profiler_rocm_tensorboard_operartor_view.png differ
diff --git a/_static/img/profiler_trace_view1.png b/_static/img/profiler_trace_view1.png
new file mode 100644
index 00000000000..215fe03e724
Binary files /dev/null and b/_static/img/profiler_trace_view1.png differ
diff --git a/_static/img/profiler_trace_view2.png b/_static/img/profiler_trace_view2.png
new file mode 100644
index 00000000000..790ef5d58ea
Binary files /dev/null and b/_static/img/profiler_trace_view2.png differ
diff --git a/_static/img/profiler_trace_view_fwd_bwd.png b/_static/img/profiler_trace_view_fwd_bwd.png
new file mode 100644
index 00000000000..c773b829e5d
Binary files /dev/null and b/_static/img/profiler_trace_view_fwd_bwd.png differ
diff --git a/_static/img/profiler_views_list.png b/_static/img/profiler_views_list.png
new file mode 100644
index 00000000000..040f392e366
Binary files /dev/null and b/_static/img/profiler_views_list.png differ
diff --git a/_static/img/profiler_vscode.png b/_static/img/profiler_vscode.png
new file mode 100644
index 00000000000..afb99f06937
Binary files /dev/null and b/_static/img/profiler_vscode.png differ
diff --git a/_static/img/pruning.png b/_static/img/pruning.png
new file mode 100644
index 00000000000..7359f11e9a6
Binary files /dev/null and b/_static/img/pruning.png differ
diff --git a/_static/img/pruning_flow.jpg b/_static/img/pruning_flow.jpg
new file mode 100644
index 00000000000..bd57158b302
Binary files /dev/null and b/_static/img/pruning_flow.jpg differ
diff --git a/_static/img/python_extension_autoload_impl.png b/_static/img/python_extension_autoload_impl.png
new file mode 100644
index 00000000000..64e18fc7b4b
Binary files /dev/null and b/_static/img/python_extension_autoload_impl.png differ
diff --git a/_static/img/pytorch-logo-dark.png b/_static/img/pytorch-logo-dark.png
old mode 100644
new mode 100755
index 0288a564e22..7992605b01f
Binary files a/_static/img/pytorch-logo-dark.png and b/_static/img/pytorch-logo-dark.png differ
diff --git a/_static/img/pytorch-logo-dark.svg b/_static/img/pytorch-logo-dark.svg
old mode 100644
new mode 100755
index 717a3ce942f..5e530003858
--- a/_static/img/pytorch-logo-dark.svg
+++ b/_static/img/pytorch-logo-dark.svg
@@ -1,24 +1,33 @@
-
+
diff --git a/_static/img/qat.png b/_static/img/qat.png
new file mode 100644
index 00000000000..e8ca311745c
Binary files /dev/null and b/_static/img/qat.png differ
diff --git a/_static/img/quant_embeddings.png b/_static/img/quant_embeddings.png
new file mode 100644
index 00000000000..035561465a2
Binary files /dev/null and b/_static/img/quant_embeddings.png differ
diff --git a/_static/img/ray-tune.png b/_static/img/ray-tune.png
new file mode 100644
index 00000000000..febd6de282e
Binary files /dev/null and b/_static/img/ray-tune.png differ
diff --git a/_static/img/reinforcement_learning_diagram.drawio b/_static/img/reinforcement_learning_diagram.drawio
new file mode 100644
index 00000000000..2ff4e6f0270
--- /dev/null
+++ b/_static/img/reinforcement_learning_diagram.drawio
@@ -0,0 +1 @@
+5Vpbc+MmFP41nmkfmpGEpMiPjTftzrTZZtbbbbYvHSxhiRQJFeHb/vqChG4gx95ElqfTeCaGwwEO37lwDskMLNL9zwzmyQONEJk5VrSfgXczx7GtuSW+JOVQUXzXrggxw5FiaglL/BXVMxV1gyNU9Bg5pYTjvE8MaZahkPdokDG667OtKenvmsMYGYRlCIlJ/QNHPKmogWe19PcIx0m9s22pkRTWzIpQJDCiuw4J3M/AglHKq1a6XyAiwatxqeb9dGS0EYyhjJ8z4flQfCient3PwfKXVfTn40P6/vMPapUtJBt14I8oJ/AgaA8opeygZOeHGhBGN1mE5JrWDNztEszRMoehHN0JExC0hKdE9GzRXGNCFpRQVs4Fa09+BD1mMMJC7s7YqvyIsYIz+jfqjPjljxhRsiLG0f4oCHYDrbBJRFPE5SEsNcGt1aPMMVDdXatbu1ZY0tGrr2hQmVPcrNwiLhoK9G9QgGMo4Lec41T6gWN9535v4C/WFLaPTmM/AlrebR8t2z0TruBScAEDLgaziKaCtoI8TAy4wg3bltYqIUFZ9KOMCaIbElgUOOxD1rftai0UGQFCg0/sRzcsRKfdjEMWI37KGkx1dOD2BtCuaQwRyPG2L+6QCtQOjxSLgxz1DRdoWqyOqWZ1I42+kNdfCOjmUOFgLCR0U0aemi2XDMVxgQ3ztK0X5fJtjR/0+EWjkqA1z0YHr7dYz7DYR0pwKM/5AfFRw2sEUbAOh8PrLYI+sgbDaxig1foy4dWxrh1fAzOACodeqi5lPKExzSC5b6laGGh5fqU0V8g/I84PKluBG06HQu8okcN/W+Q4OyS8CWTfsPFPlbz/Cxu/eg5hm0nEmPcg2mP+JNs3nup96Yy823c7h/HvTu8/cXfqV9H8lXen5xxJuUa+O91A2yd4+e709LvTP8Hvvsh/mbvWNtPDUR0hE+I9NVNF50vrFrLb+kLZq52hcaCO+9hTuY9zpvt413Qf3Vqc29e6z1xbyJkm9TSKb51fS4mdKVJP2zXc4fc8grwsLb3rlpaODog3cItaU96ijUdNmirWscHuXq03jjdVeLDnZ8aHI+qcJsGspewFdp8Iee8ivJU7Ehxn5YD/z0a+qN0RtOZtT7Ri9Q1Tac3ZqsjLvvWJQZzhLBbNUmtqXSFnuXQzb5zd7Bvxa5FQWkgvbB4vJDxCgXVlCEOOaXZhURwpylJQiRQFZdsL7wfkfh9RSFkkGql6XrQ2KiRddG9X7t2+rF10L6/ElUpu5VZ/ZWUt1D/piuk76/K8pWyq5S+lHiVi23oGaA9E7PlAxG4Yxw/ZZr4X1q5Vu9AE6V8wP5UAyt4jYlgcG7HrlUhVGL1WkgeO5EDf/r5oDdcuo9dIeqUPXk7ygK/xn3iPNACxJkgKHTMpHNVJBmod6+Z2snzmqmWMrlCgVx/nWjjQLc+7jIUDvYw5ZeFA43emsFjzCf0iYd2ava6q7z2LTVbX18XdyaDvX9UjNIMBevl2tkdo71VATyrG8ghd4LcV6qLb/oW/Ym//TwLc/ws=
\ No newline at end of file
diff --git a/_static/img/reinforcement_learning_diagram.jpg b/_static/img/reinforcement_learning_diagram.jpg
new file mode 100644
index 00000000000..7e04efc2534
Binary files /dev/null and b/_static/img/reinforcement_learning_diagram.jpg differ
diff --git a/_static/img/replaybuffer_traj.png b/_static/img/replaybuffer_traj.png
new file mode 100644
index 00000000000..64773ee8f78
Binary files /dev/null and b/_static/img/replaybuffer_traj.png differ
diff --git a/_static/img/rnnclass.png b/_static/img/rnnclass.png
new file mode 100755
index 00000000000..ff34c9e63ee
Binary files /dev/null and b/_static/img/rnnclass.png differ
diff --git a/_static/img/rollout_recurrent.png b/_static/img/rollout_recurrent.png
new file mode 100644
index 00000000000..2ce24d40d23
Binary files /dev/null and b/_static/img/rollout_recurrent.png differ
diff --git a/_static/img/rpc-images/batch.png b/_static/img/rpc-images/batch.png
new file mode 100644
index 00000000000..cde410d1bd1
Binary files /dev/null and b/_static/img/rpc-images/batch.png differ
diff --git a/_static/img/rpc_trace_img.png b/_static/img/rpc_trace_img.png
new file mode 100644
index 00000000000..4faaf97ad47
Binary files /dev/null and b/_static/img/rpc_trace_img.png differ
diff --git a/_static/img/sample_file.jpeg b/_static/img/sample_file.jpeg
new file mode 100644
index 00000000000..a7b314bd969
Binary files /dev/null and b/_static/img/sample_file.jpeg differ
diff --git a/_static/img/scipynumpy.png b/_static/img/scipynumpy.png
new file mode 100755
index 00000000000..d730af1a4f4
Binary files /dev/null and b/_static/img/scipynumpy.png differ
diff --git a/_static/img/seq-seq-images/attention-decoder-network.png b/_static/img/seq-seq-images/attention-decoder-network.png
index 243f87c6e97..d31d42a5af1 100755
Binary files a/_static/img/seq-seq-images/attention-decoder-network.png and b/_static/img/seq-seq-images/attention-decoder-network.png differ
diff --git a/_static/img/seq2seq_flat.png b/_static/img/seq2seq_flat.png
old mode 100644
new mode 100755
diff --git a/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav b/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav
new file mode 100644
index 00000000000..3f899c9dadd
Binary files /dev/null and b/_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav differ
diff --git a/_static/img/stn/FSeq.png b/_static/img/stn/FSeq.png
new file mode 100755
index 00000000000..8bdd84acdf9
Binary files /dev/null and b/_static/img/stn/FSeq.png differ
diff --git a/_static/img/stn/Five.gif b/_static/img/stn/Five.gif
new file mode 100755
index 00000000000..5a1939933d3
Binary files /dev/null and b/_static/img/stn/Five.gif differ
diff --git a/_static/img/stn/stn-arch.png b/_static/img/stn/stn-arch.png
new file mode 100755
index 00000000000..670c99fbfdd
Binary files /dev/null and b/_static/img/stn/stn-arch.png differ
diff --git a/_static/img/stn/tr.png b/_static/img/stn/tr.png
new file mode 100755
index 00000000000..f80d2d8b093
Binary files /dev/null and b/_static/img/stn/tr.png differ
diff --git a/_static/img/tensor_illustration.png b/_static/img/tensor_illustration.png
old mode 100644
new mode 100755
diff --git a/_static/img/tensor_illustration_flat.png b/_static/img/tensor_illustration_flat.png
old mode 100644
new mode 100755
diff --git a/_static/img/tensorboard_figure.png b/_static/img/tensorboard_figure.png
new file mode 100644
index 00000000000..e4dd38e98da
Binary files /dev/null and b/_static/img/tensorboard_figure.png differ
diff --git a/_static/img/tensorboard_first_view.png b/_static/img/tensorboard_first_view.png
new file mode 100644
index 00000000000..702c8158a82
Binary files /dev/null and b/_static/img/tensorboard_first_view.png differ
diff --git a/_static/img/tensorboard_images.png b/_static/img/tensorboard_images.png
new file mode 100644
index 00000000000..79b7e15a7af
Binary files /dev/null and b/_static/img/tensorboard_images.png differ
diff --git a/_static/img/tensorboard_model_viz.png b/_static/img/tensorboard_model_viz.png
new file mode 100644
index 00000000000..2cd22344f15
Binary files /dev/null and b/_static/img/tensorboard_model_viz.png differ
diff --git a/_static/img/tensorboard_pr_curves.png b/_static/img/tensorboard_pr_curves.png
new file mode 100644
index 00000000000..0360187eae6
Binary files /dev/null and b/_static/img/tensorboard_pr_curves.png differ
diff --git a/_static/img/tensorboard_projector.png b/_static/img/tensorboard_projector.png
new file mode 100644
index 00000000000..f709efc32d0
Binary files /dev/null and b/_static/img/tensorboard_projector.png differ
diff --git a/_static/img/tensorboard_scalar_runs.png b/_static/img/tensorboard_scalar_runs.png
new file mode 100644
index 00000000000..f89ace713aa
Binary files /dev/null and b/_static/img/tensorboard_scalar_runs.png differ
diff --git a/_static/img/text_sentiment_ngrams_model.png b/_static/img/text_sentiment_ngrams_model.png
new file mode 100644
index 00000000000..94fdf554047
Binary files /dev/null and b/_static/img/text_sentiment_ngrams_model.png differ
diff --git a/_static/img/thumbnails/220px-KnnClassification.png b/_static/img/thumbnails/220px-KnnClassification.png
new file mode 100755
index 00000000000..fb9ee948f45
Binary files /dev/null and b/_static/img/thumbnails/220px-KnnClassification.png differ
diff --git a/_static/img/thumbnails/babel.jpg b/_static/img/thumbnails/babel.jpg
old mode 100644
new mode 100755
diff --git a/_static/img/thumbnails/captum_teaser.png b/_static/img/thumbnails/captum_teaser.png
new file mode 100644
index 00000000000..c7fcb2c093a
Binary files /dev/null and b/_static/img/thumbnails/captum_teaser.png differ
diff --git a/_static/img/thumbnails/cropped/60-min-blitz.png b/_static/img/thumbnails/cropped/60-min-blitz.png
new file mode 100644
index 00000000000..681a16d9935
Binary files /dev/null and b/_static/img/thumbnails/cropped/60-min-blitz.png differ
diff --git a/_static/img/thumbnails/cropped/Adversarial-Example-Generation.png b/_static/img/thumbnails/cropped/Adversarial-Example-Generation.png
new file mode 100644
index 00000000000..ad5014e805c
Binary files /dev/null and b/_static/img/thumbnails/cropped/Adversarial-Example-Generation.png differ
diff --git a/_static/img/thumbnails/cropped/Autograd-in-Cpp-Frontend.png b/_static/img/thumbnails/cropped/Autograd-in-Cpp-Frontend.png
new file mode 100644
index 00000000000..3aec75031ae
Binary files /dev/null and b/_static/img/thumbnails/cropped/Autograd-in-Cpp-Frontend.png differ
diff --git a/_static/img/thumbnails/cropped/Combining-Distributed-DataParallel-with-Distributed-RPC-Framework.png b/_static/img/thumbnails/cropped/Combining-Distributed-DataParallel-with-Distributed-RPC-Framework.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Combining-Distributed-DataParallel-with-Distributed-RPC-Framework.png differ
diff --git a/_static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png b/_static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png differ
diff --git a/_static/img/thumbnails/cropped/Customize-Process-Group-Backends-Using-Cpp-Extensions.png b/_static/img/thumbnails/cropped/Customize-Process-Group-Backends-Using-Cpp-Extensions.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Customize-Process-Group-Backends-Using-Cpp-Extensions.png differ
diff --git a/_static/img/thumbnails/cropped/DCGAN-Tutorial.png b/_static/img/thumbnails/cropped/DCGAN-Tutorial.png
new file mode 100644
index 00000000000..a0c89096e9a
Binary files /dev/null and b/_static/img/thumbnails/cropped/DCGAN-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png b/_static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png differ
diff --git a/_static/img/thumbnails/cropped/Distributed-Pipeline-Parallelism-Using-RPC.png b/_static/img/thumbnails/cropped/Distributed-Pipeline-Parallelism-Using-RPC.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Distributed-Pipeline-Parallelism-Using-RPC.png differ
diff --git a/_static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png b/_static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png
new file mode 100755
index 00000000000..00156df042e
Binary files /dev/null and b/_static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png differ
diff --git a/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Classes.png b/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Classes.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Classes.png differ
diff --git a/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png b/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png differ
diff --git a/_static/img/thumbnails/cropped/Getting Started with Distributed-RPC-Framework.png b/_static/img/thumbnails/cropped/Getting Started with Distributed-RPC-Framework.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting Started with Distributed-RPC-Framework.png differ
diff --git a/_static/img/thumbnails/cropped/Getting-Started-with Distributed RPC Framework.png b/_static/img/thumbnails/cropped/Getting-Started-with Distributed RPC Framework.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting-Started-with Distributed RPC Framework.png differ
diff --git a/_static/img/thumbnails/cropped/Getting-Started-with-DCP.png b/_static/img/thumbnails/cropped/Getting-Started-with-DCP.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting-Started-with-DCP.png differ
diff --git a/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-Data-Parallel.png b/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-Data-Parallel.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-Data-Parallel.png differ
diff --git a/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-RPC-Framework.png b/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-RPC-Framework.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting-Started-with-Distributed-RPC-Framework.png differ
diff --git a/_static/img/thumbnails/cropped/Getting-Started-with-FSDP.png b/_static/img/thumbnails/cropped/Getting-Started-with-FSDP.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Getting-Started-with-FSDP.png differ
diff --git a/_static/img/thumbnails/cropped/Implementing-Batch-RPC-Processing-Using-Asynchronous-Executions.png b/_static/img/thumbnails/cropped/Implementing-Batch-RPC-Processing-Using-Asynchronous-Executions.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Implementing-Batch-RPC-Processing-Using-Asynchronous-Executions.png differ
diff --git a/_static/img/thumbnails/cropped/Implementing-a-Parameter-Server-Using-Distributed-RPC-Framework.png b/_static/img/thumbnails/cropped/Implementing-a-Parameter-Server-Using-Distributed-RPC-Framework.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Implementing-a-Parameter-Server-Using-Distributed-RPC-Framework.png differ
diff --git a/_static/img/thumbnails/cropped/Introduction-to-TorchScript.png b/_static/img/thumbnails/cropped/Introduction-to-TorchScript.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Introduction-to-TorchScript.png differ
diff --git a/_static/img/thumbnails/cropped/Language-Translation-with-TorchText.png b/_static/img/thumbnails/cropped/Language-Translation-with-TorchText.png
new file mode 100644
index 00000000000..9330c6cbdb5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Language-Translation-with-TorchText.png differ
diff --git a/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png b/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png differ
diff --git a/_static/img/thumbnails/cropped/Loading-a-TorchScript-Model-in-Cpp.png b/_static/img/thumbnails/cropped/Loading-a-TorchScript-Model-in-Cpp.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Loading-a-TorchScript-Model-in-Cpp.png differ
diff --git a/_static/img/thumbnails/cropped/Model-Parallel-Best-Practices.png b/_static/img/thumbnails/cropped/Model-Parallel-Best-Practices.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Model-Parallel-Best-Practices.png differ
diff --git a/_static/img/thumbnails/cropped/NLP-From-Scratch-Classifying-Names-with-a-Character-Level-RNN.png b/_static/img/thumbnails/cropped/NLP-From-Scratch-Classifying-Names-with-a-Character-Level-RNN.png
new file mode 100644
index 00000000000..0aa02de9a53
Binary files /dev/null and b/_static/img/thumbnails/cropped/NLP-From-Scratch-Classifying-Names-with-a-Character-Level-RNN.png differ
diff --git a/_static/img/thumbnails/cropped/NLP-From-Scratch-Generating-Names-with-a-Character-Level-RNN.png b/_static/img/thumbnails/cropped/NLP-From-Scratch-Generating-Names-with-a-Character-Level-RNN.png
new file mode 100644
index 00000000000..a63d82ba4b4
Binary files /dev/null and b/_static/img/thumbnails/cropped/NLP-From-Scratch-Generating-Names-with-a-Character-Level-RNN.png differ
diff --git a/_static/img/thumbnails/cropped/NLP-From-Scratch-Translation-with-a-Sequence-to-Sequence-Network-and-Attention.png b/_static/img/thumbnails/cropped/NLP-From-Scratch-Translation-with-a-Sequence-to-Sequence-Network-and-Attention.png
new file mode 100644
index 00000000000..11d4f07c3bf
Binary files /dev/null and b/_static/img/thumbnails/cropped/NLP-From-Scratch-Translation-with-a-Sequence-to-Sequence-Network-and-Attention.png differ
diff --git a/_static/img/thumbnails/cropped/Pruning-Tutorial.png b/_static/img/thumbnails/cropped/Pruning-Tutorial.png
new file mode 100644
index 00000000000..32953c7ab19
Binary files /dev/null and b/_static/img/thumbnails/cropped/Pruning-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/PyTorch-Distributed-Overview.png b/_static/img/thumbnails/cropped/PyTorch-Distributed-Overview.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/PyTorch-Distributed-Overview.png differ
diff --git a/_static/img/thumbnails/cropped/Sequence-to-Sequence-Modeling-with-nnTransformer-andTorchText.png b/_static/img/thumbnails/cropped/Sequence-to-Sequence-Modeling-with-nnTransformer-andTorchText.png
new file mode 100644
index 00000000000..00c4a236f24
Binary files /dev/null and b/_static/img/thumbnails/cropped/Sequence-to-Sequence-Modeling-with-nnTransformer-andTorchText.png differ
diff --git a/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png b/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png
new file mode 100644
index 00000000000..76f2bcaf4de
Binary files /dev/null and b/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/Text-Classification-with-TorchText.png b/_static/img/thumbnails/cropped/Text-Classification-with-TorchText.png
new file mode 100644
index 00000000000..e46aa333390
Binary files /dev/null and b/_static/img/thumbnails/cropped/Text-Classification-with-TorchText.png differ
diff --git a/_static/img/thumbnails/cropped/TorchScript-Parallelism.jpg b/_static/img/thumbnails/cropped/TorchScript-Parallelism.jpg
new file mode 100644
index 00000000000..237990a0460
Binary files /dev/null and b/_static/img/thumbnails/cropped/TorchScript-Parallelism.jpg differ
diff --git a/_static/img/thumbnails/cropped/TorchVision-Object-Detection-Finetuning-Tutorial.png b/_static/img/thumbnails/cropped/TorchVision-Object-Detection-Finetuning-Tutorial.png
new file mode 100644
index 00000000000..e79ff0d395e
Binary files /dev/null and b/_static/img/thumbnails/cropped/TorchVision-Object-Detection-Finetuning-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png b/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png differ
diff --git a/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png b/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png differ
diff --git a/_static/img/thumbnails/cropped/Transfer-Learning-for-Computer-Vision-Tutorial.png b/_static/img/thumbnails/cropped/Transfer-Learning-for-Computer-Vision-Tutorial.png
new file mode 100644
index 00000000000..029f0ff1bea
Binary files /dev/null and b/_static/img/thumbnails/cropped/Transfer-Learning-for-Computer-Vision-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/Tutorials_Card_Template.psd b/_static/img/thumbnails/cropped/Tutorials_Card_Template.psd
new file mode 100644
index 00000000000..6caf48a5951
Binary files /dev/null and b/_static/img/thumbnails/cropped/Tutorials_Card_Template.psd differ
diff --git a/_static/img/thumbnails/cropped/Using-the-PyTorch-Cpp-Frontend.png b/_static/img/thumbnails/cropped/Using-the-PyTorch-Cpp-Frontend.png
new file mode 100644
index 00000000000..3aec75031ae
Binary files /dev/null and b/_static/img/thumbnails/cropped/Using-the-PyTorch-Cpp-Frontend.png differ
diff --git a/_static/img/thumbnails/cropped/Writing-Distributed-Applications-with-PyTorch.png b/_static/img/thumbnails/cropped/Writing-Distributed-Applications-with-PyTorch.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/Writing-Distributed-Applications-with-PyTorch.png differ
diff --git a/_static/img/thumbnails/cropped/advanced-PyTorch-1point0-Distributed-Trainer-with-Amazon-AWS.png b/_static/img/thumbnails/cropped/advanced-PyTorch-1point0-Distributed-Trainer-with-Amazon-AWS.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/advanced-PyTorch-1point0-Distributed-Trainer-with-Amazon-AWS.png differ
diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png
new file mode 100644
index 00000000000..a6916ce5605
Binary files /dev/null and b/_static/img/thumbnails/cropped/amp.png differ
diff --git a/_static/img/thumbnails/cropped/android.png b/_static/img/thumbnails/cropped/android.png
new file mode 100644
index 00000000000..5c6079d9090
Binary files /dev/null and b/_static/img/thumbnails/cropped/android.png differ
diff --git a/_static/img/thumbnails/cropped/custom-datasets-transforms-and-dataloaders.png b/_static/img/thumbnails/cropped/custom-datasets-transforms-and-dataloaders.png
new file mode 100644
index 00000000000..5f73aa5663c
Binary files /dev/null and b/_static/img/thumbnails/cropped/custom-datasets-transforms-and-dataloaders.png differ
diff --git a/_static/img/thumbnails/cropped/defining-a-network.PNG b/_static/img/thumbnails/cropped/defining-a-network.PNG
new file mode 100644
index 00000000000..ded6a9ed583
Binary files /dev/null and b/_static/img/thumbnails/cropped/defining-a-network.PNG differ
diff --git a/_static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png b/_static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png
new file mode 100644
index 00000000000..18cbc1d0bc2
Binary files /dev/null and b/_static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png differ
diff --git a/_static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png b/_static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png
new file mode 100644
index 00000000000..d52414ec275
Binary files /dev/null and b/_static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png differ
diff --git a/_static/img/thumbnails/cropped/experimental-Quantized-Transfer-Learning-for-Computer-Vision-Tutorial.png b/_static/img/thumbnails/cropped/experimental-Quantized-Transfer-Learning-for-Computer-Vision-Tutorial.png
new file mode 100644
index 00000000000..d826d8170c1
Binary files /dev/null and b/_static/img/thumbnails/cropped/experimental-Quantized-Transfer-Learning-for-Computer-Vision-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/experimental-Static-Quantization-with-Eager-Mode-in-PyTorch.png b/_static/img/thumbnails/cropped/experimental-Static-Quantization-with-Eager-Mode-in-PyTorch.png
new file mode 100644
index 00000000000..d826d8170c1
Binary files /dev/null and b/_static/img/thumbnails/cropped/experimental-Static-Quantization-with-Eager-Mode-in-PyTorch.png differ
diff --git a/_static/img/thumbnails/cropped/generic-pytorch-logo.png b/_static/img/thumbnails/cropped/generic-pytorch-logo.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/generic-pytorch-logo.png differ
diff --git a/_static/img/thumbnails/cropped/ios.png b/_static/img/thumbnails/cropped/ios.png
new file mode 100644
index 00000000000..8c1d4a2b04d
Binary files /dev/null and b/_static/img/thumbnails/cropped/ios.png differ
diff --git a/_static/img/thumbnails/cropped/knowledge_distillation_pytorch_logo.png b/_static/img/thumbnails/cropped/knowledge_distillation_pytorch_logo.png
new file mode 100644
index 00000000000..3ce40781542
Binary files /dev/null and b/_static/img/thumbnails/cropped/knowledge_distillation_pytorch_logo.png differ
diff --git a/_static/img/thumbnails/cropped/learning-pytorch-with-examples.png b/_static/img/thumbnails/cropped/learning-pytorch-with-examples.png
new file mode 100644
index 00000000000..b292603835b
Binary files /dev/null and b/_static/img/thumbnails/cropped/learning-pytorch-with-examples.png differ
diff --git a/_static/img/thumbnails/cropped/loading-data-in-pytorch.png b/_static/img/thumbnails/cropped/loading-data-in-pytorch.png
new file mode 100644
index 00000000000..20309e32cf5
Binary files /dev/null and b/_static/img/thumbnails/cropped/loading-data-in-pytorch.png differ
diff --git a/_static/img/thumbnails/cropped/loading-data.PNG b/_static/img/thumbnails/cropped/loading-data.PNG
new file mode 100644
index 00000000000..0cb07e34e5e
Binary files /dev/null and b/_static/img/thumbnails/cropped/loading-data.PNG differ
diff --git a/_static/img/thumbnails/cropped/model-interpretability-using-captum.png b/_static/img/thumbnails/cropped/model-interpretability-using-captum.png
new file mode 100644
index 00000000000..2e531ae27c9
Binary files /dev/null and b/_static/img/thumbnails/cropped/model-interpretability-using-captum.png differ
diff --git a/_static/img/thumbnails/cropped/parametrizations.png b/_static/img/thumbnails/cropped/parametrizations.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/parametrizations.png differ
diff --git a/_static/img/thumbnails/cropped/profile.png b/_static/img/thumbnails/cropped/profile.png
new file mode 100644
index 00000000000..372db8bbe87
Binary files /dev/null and b/_static/img/thumbnails/cropped/profile.png differ
diff --git a/_static/img/thumbnails/cropped/profiler.png b/_static/img/thumbnails/cropped/profiler.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/profiler.png differ
diff --git a/_static/img/thumbnails/cropped/pytorch-logo.png b/_static/img/thumbnails/cropped/pytorch-logo.png
new file mode 100644
index 00000000000..426a14d98f5
Binary files /dev/null and b/_static/img/thumbnails/cropped/pytorch-logo.png differ
diff --git a/_static/img/thumbnails/cropped/realtime_rpi.png b/_static/img/thumbnails/cropped/realtime_rpi.png
new file mode 100644
index 00000000000..b233f3df3a1
Binary files /dev/null and b/_static/img/thumbnails/cropped/realtime_rpi.png differ
diff --git a/_static/img/thumbnails/cropped/saving-and-loading-general-checkpoint.PNG b/_static/img/thumbnails/cropped/saving-and-loading-general-checkpoint.PNG
new file mode 100644
index 00000000000..ba351430712
Binary files /dev/null and b/_static/img/thumbnails/cropped/saving-and-loading-general-checkpoint.PNG differ
diff --git a/_static/img/thumbnails/cropped/saving-and-loading-models-across-devices.PNG b/_static/img/thumbnails/cropped/saving-and-loading-models-across-devices.PNG
new file mode 100644
index 00000000000..a1c337928a1
Binary files /dev/null and b/_static/img/thumbnails/cropped/saving-and-loading-models-across-devices.PNG differ
diff --git a/_static/img/thumbnails/cropped/saving-and-loading-models-for-inference.PNG b/_static/img/thumbnails/cropped/saving-and-loading-models-for-inference.PNG
new file mode 100644
index 00000000000..b8075559c1d
Binary files /dev/null and b/_static/img/thumbnails/cropped/saving-and-loading-models-for-inference.PNG differ
diff --git a/_static/img/thumbnails/cropped/saving-multiple-models.PNG b/_static/img/thumbnails/cropped/saving-multiple-models.PNG
new file mode 100644
index 00000000000..2917cac557a
Binary files /dev/null and b/_static/img/thumbnails/cropped/saving-multiple-models.PNG differ
diff --git a/_static/img/thumbnails/cropped/torch-nn.png b/_static/img/thumbnails/cropped/torch-nn.png
new file mode 100644
index 00000000000..44a3e8dca1d
Binary files /dev/null and b/_static/img/thumbnails/cropped/torch-nn.png differ
diff --git a/_static/img/thumbnails/cropped/torch_text_logo.png b/_static/img/thumbnails/cropped/torch_text_logo.png
new file mode 100644
index 00000000000..3fe736d60e2
Binary files /dev/null and b/_static/img/thumbnails/cropped/torch_text_logo.png differ
diff --git a/_static/img/thumbnails/cropped/torchaudio-Tutorial.png b/_static/img/thumbnails/cropped/torchaudio-Tutorial.png
new file mode 100644
index 00000000000..c49aa17c02f
Binary files /dev/null and b/_static/img/thumbnails/cropped/torchaudio-Tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/torchaudio-alignment.png b/_static/img/thumbnails/cropped/torchaudio-alignment.png
new file mode 100644
index 00000000000..d5a25f35219
Binary files /dev/null and b/_static/img/thumbnails/cropped/torchaudio-alignment.png differ
diff --git a/_static/img/thumbnails/cropped/torchaudio-asr.png b/_static/img/thumbnails/cropped/torchaudio-asr.png
new file mode 100644
index 00000000000..ff84f3ff3f1
Binary files /dev/null and b/_static/img/thumbnails/cropped/torchaudio-asr.png differ
diff --git a/_static/img/thumbnails/cropped/torchaudio-speech.png b/_static/img/thumbnails/cropped/torchaudio-speech.png
new file mode 100644
index 00000000000..c874a6bb482
Binary files /dev/null and b/_static/img/thumbnails/cropped/torchaudio-speech.png differ
diff --git a/_static/img/thumbnails/cropped/torchscript_overview.png b/_static/img/thumbnails/cropped/torchscript_overview.png
new file mode 100644
index 00000000000..63e599b1a80
Binary files /dev/null and b/_static/img/thumbnails/cropped/torchscript_overview.png differ
diff --git a/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png b/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png
new file mode 100644
index 00000000000..0590cf227d9
Binary files /dev/null and b/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png differ
diff --git a/_static/img/thumbnails/cropped/using-flask-create-restful-api.png b/_static/img/thumbnails/cropped/using-flask-create-restful-api.png
new file mode 100644
index 00000000000..176c4de6d5b
Binary files /dev/null and b/_static/img/thumbnails/cropped/using-flask-create-restful-api.png differ
diff --git a/_static/img/thumbnails/cropped/visualizing-with-tensorboard.png b/_static/img/thumbnails/cropped/visualizing-with-tensorboard.png
new file mode 100644
index 00000000000..8fdecca65fe
Binary files /dev/null and b/_static/img/thumbnails/cropped/visualizing-with-tensorboard.png differ
diff --git a/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png b/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png
new file mode 100644
index 00000000000..6ff6d97f2e2
Binary files /dev/null and b/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png differ
diff --git a/_static/img/thumbnails/cropped/warmstarting-models.PNG b/_static/img/thumbnails/cropped/warmstarting-models.PNG
new file mode 100644
index 00000000000..385f2ab80c8
Binary files /dev/null and b/_static/img/thumbnails/cropped/warmstarting-models.PNG differ
diff --git a/_static/img/thumbnails/cropped/what-is-a-state-dict.PNG b/_static/img/thumbnails/cropped/what-is-a-state-dict.PNG
new file mode 100644
index 00000000000..b0eee89ad73
Binary files /dev/null and b/_static/img/thumbnails/cropped/what-is-a-state-dict.PNG differ
diff --git a/_static/img/thumbnails/cropped/zeroing-out-gradients.PNG b/_static/img/thumbnails/cropped/zeroing-out-gradients.PNG
new file mode 100644
index 00000000000..0f21b230abf
Binary files /dev/null and b/_static/img/thumbnails/cropped/zeroing-out-gradients.PNG differ
diff --git a/_static/img/thumbnails/custom_dataset.png b/_static/img/thumbnails/custom_dataset.png
new file mode 100644
index 00000000000..59a8993bc4a
Binary files /dev/null and b/_static/img/thumbnails/custom_dataset.png differ
diff --git a/_static/img/thumbnails/default.png b/_static/img/thumbnails/default.png
old mode 100644
new mode 100755
diff --git a/_static/img/thumbnails/defining_a_network.png b/_static/img/thumbnails/defining_a_network.png
new file mode 100644
index 00000000000..f0c0a940713
Binary files /dev/null and b/_static/img/thumbnails/defining_a_network.png differ
diff --git a/_static/img/thumbnails/examples.png b/_static/img/thumbnails/examples.png
old mode 100644
new mode 100755
diff --git a/_static/img/thumbnails/eye.png b/_static/img/thumbnails/eye.png
new file mode 100755
index 00000000000..6feec8d9558
Binary files /dev/null and b/_static/img/thumbnails/eye.png differ
diff --git a/_static/img/thumbnails/floppy.png b/_static/img/thumbnails/floppy.png
new file mode 100755
index 00000000000..ba3ad2c3533
Binary files /dev/null and b/_static/img/thumbnails/floppy.png differ
diff --git a/_static/img/thumbnails/german_to_english_translation.png b/_static/img/thumbnails/german_to_english_translation.png
new file mode 100644
index 00000000000..a3560c574a6
Binary files /dev/null and b/_static/img/thumbnails/german_to_english_translation.png differ
diff --git a/_static/img/thumbnails/landmarked_face2.png b/_static/img/thumbnails/landmarked_face2.png
new file mode 100755
index 00000000000..2faa58b37f2
Binary files /dev/null and b/_static/img/thumbnails/landmarked_face2.png differ
diff --git a/_static/img/thumbnails/pixelated-cat.png b/_static/img/thumbnails/pixelated-cat.png
new file mode 100644
index 00000000000..c3d527f0691
Binary files /dev/null and b/_static/img/thumbnails/pixelated-cat.png differ
diff --git a/_static/img/thumbnails/pytorch-logo-flat.png b/_static/img/thumbnails/pytorch-logo-flat.png
old mode 100644
new mode 100755
diff --git a/_static/img/thumbnails/pytorch_tensorboard.png b/_static/img/thumbnails/pytorch_tensorboard.png
new file mode 100644
index 00000000000..2dce6a6e268
Binary files /dev/null and b/_static/img/thumbnails/pytorch_tensorboard.png differ
diff --git a/_static/img/thumbnails/sphx_glr_transfer_learning_tutorial_001.png b/_static/img/thumbnails/sphx_glr_transfer_learning_tutorial_001.png
new file mode 100755
index 00000000000..42372d51c89
Binary files /dev/null and b/_static/img/thumbnails/sphx_glr_transfer_learning_tutorial_001.png differ
diff --git a/_static/img/thumbnails/tensorboard_dev.png b/_static/img/thumbnails/tensorboard_dev.png
new file mode 100644
index 00000000000..056839a6359
Binary files /dev/null and b/_static/img/thumbnails/tensorboard_dev.png differ
diff --git a/_static/img/thumbnails/tensorboard_scalars.png b/_static/img/thumbnails/tensorboard_scalars.png
new file mode 100644
index 00000000000..ab6734ab3e9
Binary files /dev/null and b/_static/img/thumbnails/tensorboard_scalars.png differ
diff --git a/_static/img/thumbnails/torch-logo.png b/_static/img/thumbnails/torch-logo.png
old mode 100644
new mode 100755
diff --git a/_static/img/thumbnails/torchrec.png b/_static/img/thumbnails/torchrec.png
new file mode 100644
index 00000000000..1304b56873e
Binary files /dev/null and b/_static/img/thumbnails/torchrec.png differ
diff --git a/_static/img/thumbnails/torchtext.png b/_static/img/thumbnails/torchtext.png
new file mode 100644
index 00000000000..ee4285aef34
Binary files /dev/null and b/_static/img/thumbnails/torchtext.png differ
diff --git a/_static/img/thumbnails/tv-img.png b/_static/img/thumbnails/tv-img.png
new file mode 100644
index 00000000000..c2b2417f3f8
Binary files /dev/null and b/_static/img/thumbnails/tv-img.png differ
diff --git a/_static/img/torch-nn-vs-pytorch-nn.png b/_static/img/torch-nn-vs-pytorch-nn.png
old mode 100644
new mode 100755
diff --git a/_static/img/torch.nn.png b/_static/img/torch.nn.png
new file mode 100644
index 00000000000..3af3b087fa5
Binary files /dev/null and b/_static/img/torch.nn.png differ
diff --git a/_static/img/torchscript.png b/_static/img/torchscript.png
new file mode 100644
index 00000000000..b748d45d231
Binary files /dev/null and b/_static/img/torchscript.png differ
diff --git a/_static/img/torchscript_to_cpp.png b/_static/img/torchscript_to_cpp.png
new file mode 100644
index 00000000000..579d65b00d4
Binary files /dev/null and b/_static/img/torchscript_to_cpp.png differ
diff --git a/_static/img/torchserve-ipex-images-2/1.png b/_static/img/torchserve-ipex-images-2/1.png
new file mode 100644
index 00000000000..4a9f488236b
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/1.png differ
diff --git a/_static/img/torchserve-ipex-images-2/10.png b/_static/img/torchserve-ipex-images-2/10.png
new file mode 100644
index 00000000000..d56f34600d8
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/10.png differ
diff --git a/_static/img/torchserve-ipex-images-2/11.png b/_static/img/torchserve-ipex-images-2/11.png
new file mode 100644
index 00000000000..8ebbcc03d51
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/11.png differ
diff --git a/_static/img/torchserve-ipex-images-2/12.png b/_static/img/torchserve-ipex-images-2/12.png
new file mode 100644
index 00000000000..23c4794ae36
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/12.png differ
diff --git a/_static/img/torchserve-ipex-images-2/13.png b/_static/img/torchserve-ipex-images-2/13.png
new file mode 100644
index 00000000000..4e1dc6e1a03
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/13.png differ
diff --git a/_static/img/torchserve-ipex-images-2/14.png b/_static/img/torchserve-ipex-images-2/14.png
new file mode 100644
index 00000000000..701399e9d9b
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/14.png differ
diff --git a/_static/img/torchserve-ipex-images-2/15.png b/_static/img/torchserve-ipex-images-2/15.png
new file mode 100644
index 00000000000..b345a9d0d8c
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/15.png differ
diff --git a/_static/img/torchserve-ipex-images-2/16.png b/_static/img/torchserve-ipex-images-2/16.png
new file mode 100644
index 00000000000..39b5d6afb9c
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/16.png differ
diff --git a/_static/img/torchserve-ipex-images-2/17.png b/_static/img/torchserve-ipex-images-2/17.png
new file mode 100644
index 00000000000..bb7359bcbe6
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/17.png differ
diff --git a/_static/img/torchserve-ipex-images-2/18.png b/_static/img/torchserve-ipex-images-2/18.png
new file mode 100644
index 00000000000..30ad817a561
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/18.png differ
diff --git a/_static/img/torchserve-ipex-images-2/19.png b/_static/img/torchserve-ipex-images-2/19.png
new file mode 100644
index 00000000000..353bfb897a1
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/19.png differ
diff --git a/_static/img/torchserve-ipex-images-2/2.png b/_static/img/torchserve-ipex-images-2/2.png
new file mode 100644
index 00000000000..d7d351a3e74
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/2.png differ
diff --git a/_static/img/torchserve-ipex-images-2/20.png b/_static/img/torchserve-ipex-images-2/20.png
new file mode 100644
index 00000000000..aa94ff57dce
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/20.png differ
diff --git a/_static/img/torchserve-ipex-images-2/21.png b/_static/img/torchserve-ipex-images-2/21.png
new file mode 100644
index 00000000000..c714adc1453
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/21.png differ
diff --git a/_static/img/torchserve-ipex-images-2/22.png b/_static/img/torchserve-ipex-images-2/22.png
new file mode 100644
index 00000000000..fa7ae84c702
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/22.png differ
diff --git a/_static/img/torchserve-ipex-images-2/23.png b/_static/img/torchserve-ipex-images-2/23.png
new file mode 100644
index 00000000000..fd8a1bf8389
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/23.png differ
diff --git a/_static/img/torchserve-ipex-images-2/24.png b/_static/img/torchserve-ipex-images-2/24.png
new file mode 100644
index 00000000000..6ba858f98f0
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/24.png differ
diff --git a/_static/img/torchserve-ipex-images-2/3.png b/_static/img/torchserve-ipex-images-2/3.png
new file mode 100644
index 00000000000..6ae485bd132
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/3.png differ
diff --git a/_static/img/torchserve-ipex-images-2/4.png b/_static/img/torchserve-ipex-images-2/4.png
new file mode 100644
index 00000000000..b0fa5e68133
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/4.png differ
diff --git a/_static/img/torchserve-ipex-images-2/5.png b/_static/img/torchserve-ipex-images-2/5.png
new file mode 100644
index 00000000000..25adc177ad1
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/5.png differ
diff --git a/_static/img/torchserve-ipex-images-2/6.png b/_static/img/torchserve-ipex-images-2/6.png
new file mode 100644
index 00000000000..739d3b388d3
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/6.png differ
diff --git a/_static/img/torchserve-ipex-images-2/7.png b/_static/img/torchserve-ipex-images-2/7.png
new file mode 100644
index 00000000000..77765616d65
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/7.png differ
diff --git a/_static/img/torchserve-ipex-images-2/8.png b/_static/img/torchserve-ipex-images-2/8.png
new file mode 100644
index 00000000000..b731676cc21
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/8.png differ
diff --git a/_static/img/torchserve-ipex-images-2/9.png b/_static/img/torchserve-ipex-images-2/9.png
new file mode 100644
index 00000000000..9155201ab3c
Binary files /dev/null and b/_static/img/torchserve-ipex-images-2/9.png differ
diff --git a/_static/img/torchserve-ipex-images/1.png b/_static/img/torchserve-ipex-images/1.png
new file mode 100644
index 00000000000..fc8748b22a5
Binary files /dev/null and b/_static/img/torchserve-ipex-images/1.png differ
diff --git a/_static/img/torchserve-ipex-images/10.png b/_static/img/torchserve-ipex-images/10.png
new file mode 100644
index 00000000000..833a1bb7cf9
Binary files /dev/null and b/_static/img/torchserve-ipex-images/10.png differ
diff --git a/_static/img/torchserve-ipex-images/11.gif b/_static/img/torchserve-ipex-images/11.gif
new file mode 100644
index 00000000000..1c1a2644e8e
Binary files /dev/null and b/_static/img/torchserve-ipex-images/11.gif differ
diff --git a/_static/img/torchserve-ipex-images/12.png b/_static/img/torchserve-ipex-images/12.png
new file mode 100644
index 00000000000..b55968fd705
Binary files /dev/null and b/_static/img/torchserve-ipex-images/12.png differ
diff --git a/_static/img/torchserve-ipex-images/13.png b/_static/img/torchserve-ipex-images/13.png
new file mode 100644
index 00000000000..de9c08814e6
Binary files /dev/null and b/_static/img/torchserve-ipex-images/13.png differ
diff --git a/_static/img/torchserve-ipex-images/14.png b/_static/img/torchserve-ipex-images/14.png
new file mode 100644
index 00000000000..4d776d81647
Binary files /dev/null and b/_static/img/torchserve-ipex-images/14.png differ
diff --git a/_static/img/torchserve-ipex-images/15.png b/_static/img/torchserve-ipex-images/15.png
new file mode 100644
index 00000000000..513ccf8e053
Binary files /dev/null and b/_static/img/torchserve-ipex-images/15.png differ
diff --git a/_static/img/torchserve-ipex-images/16.png b/_static/img/torchserve-ipex-images/16.png
new file mode 100644
index 00000000000..3670d0a1dc4
Binary files /dev/null and b/_static/img/torchserve-ipex-images/16.png differ
diff --git a/_static/img/torchserve-ipex-images/17.png b/_static/img/torchserve-ipex-images/17.png
new file mode 100644
index 00000000000..5ab17373c95
Binary files /dev/null and b/_static/img/torchserve-ipex-images/17.png differ
diff --git a/_static/img/torchserve-ipex-images/18.png b/_static/img/torchserve-ipex-images/18.png
new file mode 100644
index 00000000000..50304884d3e
Binary files /dev/null and b/_static/img/torchserve-ipex-images/18.png differ
diff --git a/_static/img/torchserve-ipex-images/19.png b/_static/img/torchserve-ipex-images/19.png
new file mode 100644
index 00000000000..b123480530e
Binary files /dev/null and b/_static/img/torchserve-ipex-images/19.png differ
diff --git a/_static/img/torchserve-ipex-images/1_.png b/_static/img/torchserve-ipex-images/1_.png
new file mode 100644
index 00000000000..fc8748b22a5
Binary files /dev/null and b/_static/img/torchserve-ipex-images/1_.png differ
diff --git a/_static/img/torchserve-ipex-images/2.png b/_static/img/torchserve-ipex-images/2.png
new file mode 100644
index 00000000000..27633f25bcb
Binary files /dev/null and b/_static/img/torchserve-ipex-images/2.png differ
diff --git a/_static/img/torchserve-ipex-images/20.gif b/_static/img/torchserve-ipex-images/20.gif
new file mode 100644
index 00000000000..ba8e9e95315
Binary files /dev/null and b/_static/img/torchserve-ipex-images/20.gif differ
diff --git a/_static/img/torchserve-ipex-images/21.png b/_static/img/torchserve-ipex-images/21.png
new file mode 100644
index 00000000000..04b3ca622bf
Binary files /dev/null and b/_static/img/torchserve-ipex-images/21.png differ
diff --git a/_static/img/torchserve-ipex-images/22.png b/_static/img/torchserve-ipex-images/22.png
new file mode 100644
index 00000000000..cbb2c269a90
Binary files /dev/null and b/_static/img/torchserve-ipex-images/22.png differ
diff --git a/_static/img/torchserve-ipex-images/23.png b/_static/img/torchserve-ipex-images/23.png
new file mode 100644
index 00000000000..c9bc44463f3
Binary files /dev/null and b/_static/img/torchserve-ipex-images/23.png differ
diff --git a/_static/img/torchserve-ipex-images/24.png b/_static/img/torchserve-ipex-images/24.png
new file mode 100644
index 00000000000..8b5718c30f3
Binary files /dev/null and b/_static/img/torchserve-ipex-images/24.png differ
diff --git a/_static/img/torchserve-ipex-images/25.png b/_static/img/torchserve-ipex-images/25.png
new file mode 100644
index 00000000000..4de920e632b
Binary files /dev/null and b/_static/img/torchserve-ipex-images/25.png differ
diff --git a/_static/img/torchserve-ipex-images/26.gif b/_static/img/torchserve-ipex-images/26.gif
new file mode 100644
index 00000000000..60a5a64ad15
Binary files /dev/null and b/_static/img/torchserve-ipex-images/26.gif differ
diff --git a/_static/img/torchserve-ipex-images/27.png b/_static/img/torchserve-ipex-images/27.png
new file mode 100644
index 00000000000..c7e766155f5
Binary files /dev/null and b/_static/img/torchserve-ipex-images/27.png differ
diff --git a/_static/img/torchserve-ipex-images/28.png b/_static/img/torchserve-ipex-images/28.png
new file mode 100644
index 00000000000..b7056c4c4ac
Binary files /dev/null and b/_static/img/torchserve-ipex-images/28.png differ
diff --git a/_static/img/torchserve-ipex-images/29.png b/_static/img/torchserve-ipex-images/29.png
new file mode 100644
index 00000000000..9dcd8735111
Binary files /dev/null and b/_static/img/torchserve-ipex-images/29.png differ
diff --git a/_static/img/torchserve-ipex-images/3.png b/_static/img/torchserve-ipex-images/3.png
new file mode 100644
index 00000000000..2309071571c
Binary files /dev/null and b/_static/img/torchserve-ipex-images/3.png differ
diff --git a/_static/img/torchserve-ipex-images/30.png b/_static/img/torchserve-ipex-images/30.png
new file mode 100644
index 00000000000..96b07ec7205
Binary files /dev/null and b/_static/img/torchserve-ipex-images/30.png differ
diff --git a/_static/img/torchserve-ipex-images/31.png b/_static/img/torchserve-ipex-images/31.png
new file mode 100644
index 00000000000..601b63e511d
Binary files /dev/null and b/_static/img/torchserve-ipex-images/31.png differ
diff --git a/_static/img/torchserve-ipex-images/4.png b/_static/img/torchserve-ipex-images/4.png
new file mode 100644
index 00000000000..f12d8c7cc40
Binary files /dev/null and b/_static/img/torchserve-ipex-images/4.png differ
diff --git a/_static/img/torchserve-ipex-images/5.png b/_static/img/torchserve-ipex-images/5.png
new file mode 100644
index 00000000000..55e05e5e53c
Binary files /dev/null and b/_static/img/torchserve-ipex-images/5.png differ
diff --git a/_static/img/torchserve-ipex-images/6.png b/_static/img/torchserve-ipex-images/6.png
new file mode 100644
index 00000000000..59a028f94b9
Binary files /dev/null and b/_static/img/torchserve-ipex-images/6.png differ
diff --git a/_static/img/torchserve-ipex-images/7.png b/_static/img/torchserve-ipex-images/7.png
new file mode 100644
index 00000000000..5739cb4f53a
Binary files /dev/null and b/_static/img/torchserve-ipex-images/7.png differ
diff --git a/_static/img/torchserve-ipex-images/8.png b/_static/img/torchserve-ipex-images/8.png
new file mode 100644
index 00000000000..1e6531b6cab
Binary files /dev/null and b/_static/img/torchserve-ipex-images/8.png differ
diff --git a/_static/img/torchserve-ipex-images/9.gif b/_static/img/torchserve-ipex-images/9.gif
new file mode 100644
index 00000000000..682e2f3425e
Binary files /dev/null and b/_static/img/torchserve-ipex-images/9.gif differ
diff --git a/_static/img/trace_img.png b/_static/img/trace_img.png
new file mode 100644
index 00000000000..8c540ceb519
Binary files /dev/null and b/_static/img/trace_img.png differ
diff --git a/_static/img/trace_xpu_img.png b/_static/img/trace_xpu_img.png
new file mode 100644
index 00000000000..2eca0a78cb6
Binary files /dev/null and b/_static/img/trace_xpu_img.png differ
diff --git a/_static/img/transformer_architecture.jpg b/_static/img/transformer_architecture.jpg
new file mode 100644
index 00000000000..4188fae7c85
Binary files /dev/null and b/_static/img/transformer_architecture.jpg differ
diff --git a/_static/img/transformer_input_target.png b/_static/img/transformer_input_target.png
new file mode 100644
index 00000000000..02e87174762
Binary files /dev/null and b/_static/img/transformer_input_target.png differ
diff --git a/_static/img/tts_pipeline.png b/_static/img/tts_pipeline.png
new file mode 100644
index 00000000000..5dc37ae9ddd
Binary files /dev/null and b/_static/img/tts_pipeline.png differ
diff --git a/_static/img/tv_tutorial/tv_image03.png b/_static/img/tv_tutorial/tv_image03.png
new file mode 100644
index 00000000000..54878b15dde
Binary files /dev/null and b/_static/img/tv_tutorial/tv_image03.png differ
diff --git a/_static/img/tv_tutorial/tv_image04.png b/_static/img/tv_tutorial/tv_image04.png
new file mode 100644
index 00000000000..229bf711329
Binary files /dev/null and b/_static/img/tv_tutorial/tv_image04.png differ
diff --git a/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png
new file mode 100644
index 00000000000..1fa3d80d339
Binary files /dev/null and b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png differ
diff --git a/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png
new file mode 100644
index 00000000000..3f76deab3bf
Binary files /dev/null and b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png differ
diff --git a/_static/img/usb_semisup_learn/code.png b/_static/img/usb_semisup_learn/code.png
new file mode 100644
index 00000000000..fdc7b798a37
Binary files /dev/null and b/_static/img/usb_semisup_learn/code.png differ
diff --git a/_static/minus.png b/_static/minus.png
new file mode 100755
index 00000000000..d96755fdaf8
Binary files /dev/null and b/_static/minus.png differ
diff --git a/_static/mnist.pkl.gz b/_static/mnist.pkl.gz
new file mode 100644
index 00000000000..6a739549cc6
Binary files /dev/null and b/_static/mnist.pkl.gz differ
diff --git a/_static/no_image.png b/_static/no_image.png
new file mode 100755
index 00000000000..8c2d48d5d3f
Binary files /dev/null and b/_static/no_image.png differ
diff --git a/_static/pencil-16.png b/_static/pencil-16.png
new file mode 100644
index 00000000000..6a4f1cf688e
Binary files /dev/null and b/_static/pencil-16.png differ
diff --git a/_static/plus.png b/_static/plus.png
new file mode 100755
index 00000000000..7107cec93a9
Binary files /dev/null and b/_static/plus.png differ
diff --git a/_static/pygments.css b/_static/pygments.css
new file mode 100755
index 00000000000..20c4814dcf0
--- /dev/null
+++ b/_static/pygments.css
@@ -0,0 +1,69 @@
+.highlight .hll { background-color: #ffffcc }
+.highlight { background: #eeffcc; }
+.highlight .c { color: #408090; font-style: italic } /* Comment */
+.highlight .err { border: 1px solid #FF0000 } /* Error */
+.highlight .k { color: #007020; font-weight: bold } /* Keyword */
+.highlight .o { color: #666666 } /* Operator */
+.highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */
+.highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */
+.highlight .cp { color: #007020 } /* Comment.Preproc */
+.highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */
+.highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */
+.highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */
+.highlight .gd { color: #A00000 } /* Generic.Deleted */
+.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .gr { color: #FF0000 } /* Generic.Error */
+.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+.highlight .gi { color: #00A000 } /* Generic.Inserted */
+.highlight .go { color: #333333 } /* Generic.Output */
+.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
+.highlight .gs { font-weight: bold } /* Generic.Strong */
+.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+.highlight .gt { color: #0044DD } /* Generic.Traceback */
+.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
+.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
+.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */
+.highlight .kp { color: #007020 } /* Keyword.Pseudo */
+.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
+.highlight .kt { color: #902000 } /* Keyword.Type */
+.highlight .m { color: #208050 } /* Literal.Number */
+.highlight .s { color: #4070a0 } /* Literal.String */
+.highlight .na { color: #4070a0 } /* Name.Attribute */
+.highlight .nb { color: #007020 } /* Name.Builtin */
+.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
+.highlight .no { color: #60add5 } /* Name.Constant */
+.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */
+.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */
+.highlight .ne { color: #007020 } /* Name.Exception */
+.highlight .nf { color: #06287e } /* Name.Function */
+.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */
+.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
+.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */
+.highlight .nv { color: #bb60d5 } /* Name.Variable */
+.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */
+.highlight .w { color: #bbbbbb } /* Text.Whitespace */
+.highlight .mb { color: #208050 } /* Literal.Number.Bin */
+.highlight .mf { color: #208050 } /* Literal.Number.Float */
+.highlight .mh { color: #208050 } /* Literal.Number.Hex */
+.highlight .mi { color: #208050 } /* Literal.Number.Integer */
+.highlight .mo { color: #208050 } /* Literal.Number.Oct */
+.highlight .sa { color: #4070a0 } /* Literal.String.Affix */
+.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */
+.highlight .sc { color: #4070a0 } /* Literal.String.Char */
+.highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */
+.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
+.highlight .s2 { color: #4070a0 } /* Literal.String.Double */
+.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
+.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */
+.highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.highlight .sx { color: #c65d09 } /* Literal.String.Other */
+.highlight .sr { color: #235388 } /* Literal.String.Regex */
+.highlight .s1 { color: #4070a0 } /* Literal.String.Single */
+.highlight .ss { color: #517918 } /* Literal.String.Symbol */
+.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.highlight .fm { color: #06287e } /* Name.Function.Magic */
+.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */
+.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */
+.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */
+.highlight .il { color: #208050 } /* Literal.Number.Integer.Long */
\ No newline at end of file
diff --git a/_static/pytorch-logo-dark.svg b/_static/pytorch-logo-dark.svg
new file mode 100755
index 00000000000..5e530003858
--- /dev/null
+++ b/_static/pytorch-logo-dark.svg
@@ -0,0 +1,33 @@
+
+
+
diff --git a/_static/up-pressed.png b/_static/up-pressed.png
new file mode 100755
index 00000000000..acee3b68efb
Binary files /dev/null and b/_static/up-pressed.png differ
diff --git a/_static/up.png b/_static/up.png
new file mode 100755
index 00000000000..2a940a7da7c
Binary files /dev/null and b/_static/up.png differ
diff --git a/advanced_source/ONNXLive.rst b/advanced_source/ONNXLive.rst
new file mode 100644
index 00000000000..7177522c968
--- /dev/null
+++ b/advanced_source/ONNXLive.rst
@@ -0,0 +1,12 @@
+
+ONNX Live Tutorial
+==================
+
+This tutorial has been deprecated.
+
+Redirecting in 3 seconds...
+
+
+.. raw:: html
+
+
diff --git a/advanced_source/README.txt b/advanced_source/README.txt
index 300dd1d5b35..56f01688089 100644
--- a/advanced_source/README.txt
+++ b/advanced_source/README.txt
@@ -3,12 +3,8 @@ Advanced Tutorials
1. neural_style_tutorial.py
Neural Transfer with PyTorch
- http://pytorch.org/tutorials/advanced/neural_style_tutorial.html
+ https://pytorch.org/tutorials/advanced/neural_style_tutorial.html
2. numpy_extensions_tutorial.py
- Creating extensions using numpy and scipy
- http://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html
-
-3. c_extension.rst
- Custom C extensions for pytorch
- http://pytorch.org/tutorials/advanced/c_extension.html
+ Creating Extensions Using numpy and scipy
+ https://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html
diff --git a/advanced_source/c_extension.rst b/advanced_source/c_extension.rst
deleted file mode 100644
index 7574f2e4660..00000000000
--- a/advanced_source/c_extension.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-Custom C extensions for pytorch
-===============================
-**Author**: `Soumith Chintala `_
-
-
-Step 1. prepare your C code
----------------------------
-
-First, you have to write your C functions.
-
-Below you can find an example implementation of forward and backward
-functions of a module that adds its both inputs.
-
-In your ``.c`` files you can include TH using an ``#include <TH/TH.h>``
-directive, and THC using ``#include <THC/THC.h>``.
-
-ffi utils will make sure a compiler can find them during the build.
-
-.. code:: C
-
- /* src/my_lib.c */
- #include <TH/TH.h>
-
- int my_lib_add_forward(THFloatTensor *input1, THFloatTensor *input2,
- THFloatTensor *output)
- {
- if (!THFloatTensor_isSameSizeAs(input1, input2))
- return 0;
- THFloatTensor_resizeAs(output, input1);
- THFloatTensor_add(output, input1, input2);
- return 1;
- }
-
- int my_lib_add_backward(THFloatTensor *grad_output, THFloatTensor *grad_input)
- {
- THFloatTensor_resizeAs(grad_input, grad_output);
- THFloatTensor_fill(grad_input, 1);
- return 1;
- }
-
-
-There are no constraints on the code, except that you will have to
-prepare a single header, which will list all functions want to call from
-python.
-
-It will be used by the ffi utils to generate appropriate wrappers.
-
-.. code:: C
-
- /* src/my_lib.h */
- int my_lib_add_forward(THFloatTensor *input1, THFloatTensor *input2, THFloatTensor *output);
- int my_lib_add_backward(THFloatTensor *grad_output, THFloatTensor *grad_input);
-
-Now, you’ll need a super short file, that will build your custom
-extension:
-
-.. code:: python
-
- # build.py
- from torch.utils.ffi import create_extension
- ffi = create_extension(
- name='_ext.my_lib',
- headers='src/my_lib.h',
- sources=['src/my_lib.c'],
- with_cuda=False
- )
- ffi.build()
-
-Step 2: Include it in your Python code
---------------------------------------
-
-After you run it, pytorch will create an ``_ext`` directory and put
-``my_lib`` inside.
-
-Package name can have an arbitrary number of packages preceding the
-final module name (including none). If the build succeeded you can
-import your extension just like a regular python file.
-
-.. code:: python
-
- # functions/add.py
- import torch
- from torch.autograd import Function
- from _ext import my_lib
-
-
- class MyAddFunction(Function):
- def forward(self, input1, input2):
- output = torch.FloatTensor()
- my_lib.my_lib_add_forward(input1, input2, output)
- return output
-
- def backward(self, grad_output):
- grad_input = torch.FloatTensor()
- my_lib.my_lib_add_backward(grad_output, grad_input)
- return grad_input
-
-.. code:: python
-
- # modules/add.py
- from torch.nn import Module
- from functions.add import MyAddFunction
-
- class MyAddModule(Module):
- def forward(self, input1, input2):
- return MyAddFunction()(input1, input2)
-
-
-.. code:: python
-
- # main.py
- import torch.nn as nn
- from torch.autograd import Variable
- from modules.add import MyAddModule
-
- class MyNetwork(nn.Module):
- def __init__(self):
- super(MyNetwork, self).__init__(
- add=MyAddModule(),
- )
-
- def forward(self, input1, input2):
- return self.add(input1, input2)
-
- model = MyNetwork()
- input1, input2 = Variable(torch.randn(5, 5)), Variable(torch.randn(5, 5))
- print(model(input1, input2))
- print(input1 + input2)
-
-
diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py
new file mode 100644
index 00000000000..90ea4565dab
--- /dev/null
+++ b/advanced_source/coding_ddpg.py
@@ -0,0 +1,1220 @@
+# -*- coding: utf-8 -*-
+"""
+TorchRL objectives: Coding a DDPG loss
+======================================
+**Author**: `Vincent Moens `_
+
+"""
+
+##############################################################################
+# Overview
+# --------
+#
+# TorchRL separates the training of RL algorithms into various pieces that will be
+# assembled in your training script: the environment, the data collection and
+# storage, the model and finally the loss function.
+#
+# TorchRL losses (or "objectives") are stateful objects that contain the
+# trainable parameters (policy and value models).
+# This tutorial will guide you through the steps to code a loss from the ground up
+# using TorchRL.
+#
+# To this end, we will be focusing on DDPG, which is a relatively straightforward
+# algorithm to code.
+# `Deep Deterministic Policy Gradient `_ (DDPG)
+# is a simple continuous control algorithm. It consists in learning a
+# parametric value function for an action-observation pair, and
+# then learning a policy that outputs actions that maximize this value
+# function given a certain observation.
+#
+# What you will learn:
+#
+# - how to write a loss module and customize its value estimator;
+# - how to build an environment in TorchRL, including transforms
+# (for example, data normalization) and parallel execution;
+# - how to design a policy and value network;
+# - how to collect data from your environment efficiently and store them
+# in a replay buffer;
+# - how to store trajectories (and not transitions) in your replay buffer;
+# - how to evaluate your model.
+#
+# Prerequisites
+# ~~~~~~~~~~~~~
+#
+# This tutorial assumes that you have completed the
+# `PPO tutorial `_ which gives
+# an overview of the TorchRL components and dependencies, such as
+# :class:`tensordict.TensorDict` and :class:`tensordict.nn.TensorDictModules`,
+# although it should be
+# sufficiently transparent to be understood without a deep understanding of
+# these classes.
+#
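+# As a minimal, hedged refresher (using only public ``tensordict`` API), a
+# ``TensorDict`` is a dictionary-like container of tensors that share leading
+# batch dimensions:
+#
+# .. code-block:: python
+#
+#    import torch
+#    from tensordict import TensorDict
+#
+#    td = TensorDict({"observation": torch.randn(4, 3)}, batch_size=[4])
+#    td["observation"].shape  # torch.Size([4, 3])
+#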
+# .. note::
+# We do not aim to give a SOTA implementation of the algorithm, but rather
+# to provide a high-level illustration of TorchRL's loss implementations
+# and the library features that are to be used in the context of
+# this algorithm.
+#
+# Imports and setup
+# -----------------
+#
+# .. code-block:: bash
+#
+# %%bash
+# pip3 install torchrl mujoco glfw
+
+# sphinx_gallery_start_ignore
+import warnings
+
+warnings.filterwarnings("ignore")
+from torch import multiprocessing
+
+# TorchRL prefers the spawn start method, which restricts the creation of
+# ``~torchrl.envs.ParallelEnv`` to the `__main__` guard. For ease of reading,
+# we switch to fork here, which is also the default start method in Google's Colaboratory.
+try:
+ multiprocessing.set_start_method("fork")
+except RuntimeError:
+ pass
+
+# sphinx_gallery_end_ignore
+
+
+import torch
+import tqdm
+
+
+###############################################################################
+# We will execute the policy on CUDA if available.
+is_fork = multiprocessing.get_start_method() == "fork"
+device = (
+ torch.device(0)
+ if torch.cuda.is_available() and not is_fork
+ else torch.device("cpu")
+)
+collector_device = torch.device("cpu") # Change the device to ``cuda`` to use CUDA
+
+###############################################################################
+# TorchRL :class:`~torchrl.objectives.LossModule`
+# -----------------------------------------------
+#
+# TorchRL provides a series of losses to use in your training scripts.
+# The aim is to have losses that are easily reusable/swappable and that have
+# a simple signature.
+#
+# The main characteristics of TorchRL losses are:
+#
+# - They are stateful objects: they contain a copy of the trainable parameters
+# such that ``loss_module.parameters()`` gives whatever is needed to train the
+# algorithm.
+# - They follow the ``TensorDict`` convention: the :meth:`torch.nn.Module.forward`
+# method will receive a TensorDict as input that contains all the necessary
+# information to return a loss value.
+#
+# >>> data = replay_buffer.sample()
+# >>> loss_dict = loss_module(data)
+#
+# - They output a :class:`tensordict.TensorDict` instance with the loss values
+# written under a ``"loss_<smth>"`` entry, where ``smth`` is a string describing the
+# loss. Additional keys in the ``TensorDict`` may be useful metrics to log during
+# training time.
+#
+# .. note::
+# The reason we return independent losses is to let the user, for instance, use a
+# different optimizer for different sets of parameters. Summing the losses
+# can simply be done via
+#
+# >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_"))
+#
+# The ``__init__`` method
+# ~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The parent class of all losses is :class:`~torchrl.objectives.LossModule`.
+# Like many other components of the library, its :meth:`~torchrl.objectives.LossModule.forward` method expects
+# as input a :class:`tensordict.TensorDict` instance sampled from an experience
+# replay buffer, or any similar data structure. Using this format makes it
+# possible to re-use the module across
+# modalities, or in complex settings where the model needs to read multiple
+# entries for instance. In other words, it allows us to code a loss module that
+# is oblivious to the data type that is being given to it and that focuses on
+# running the elementary steps of the loss function and only those.
+#
+# To keep the tutorial as didactic as we can, we'll be displaying each method
+# of the class independently and we'll be populating the class at a later
+# stage.
+#
+# Let us start with the :meth:`~torchrl.objectives.LossModule.__init__`
+# method. DDPG aims at solving a control task with a simple strategy:
+# training a policy to output actions that maximize the value predicted by
+# a value network. Hence, our loss module needs to receive two networks in its
+# constructor: an actor and a value network. We expect both of these to be
+# TensorDict-compatible objects, such as
+# :class:`tensordict.nn.TensorDictModule`.
+# Our loss function will need to compute a target value and fit the value
+# network to this, and generate an action and fit the policy such that its
+# value estimate is maximized.
+#
+# The crucial step of the :meth:`LossModule.__init__` method is the call to
+# :meth:`~torchrl.objectives.LossModule.convert_to_functional`. This method will extract
+# the parameters from the module and convert it to a functional module.
+# Strictly speaking, this is not necessary and one may perfectly code all
+# the losses without it. However, we encourage its usage for the following
+# reason.
+#
+# The reason TorchRL does this is that RL algorithms often execute the same
+# model with different sets of parameters, called "trainable" and "target"
+# parameters.
+# The "trainable" parameters are those that the optimizer needs to fit. The
+# "target" parameters are usually a copy of the former's with some time lag
+# (absolute or diluted through a moving average).
+# These target parameters are used to compute the value associated with the
+# next observation. One of the advantages of using a set of target parameters
+# for the value model that does not exactly match the current configuration is
+# that it provides a pessimistic bound on the value function being computed.
+# Pay attention to the ``create_target_params`` keyword argument below: this
+# argument tells the :meth:`~torchrl.objectives.LossModule.convert_to_functional`
+# method to create a set of target parameters in the loss module to be used
+# for target value computation. If this is set to ``False`` (see the actor network
+# for instance) the ``target_actor_network_params`` attribute will still be
+# accessible but this will just return a **detached** version of the
+# actor parameters.
+#
+# Later, we will see how the target parameters should be updated in TorchRL.
+#
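+# As a hedged illustration of what such an update looks like (TorchRL ships
+# dedicated target-parameter updaters; this is not its internal code), a
+# "soft" update is a Polyak average of the trainable and target parameters:
+#
+# .. code-block:: python
+#
+#    # hypothetical sketch; tau is a small update rate such as 0.005
+#    with torch.no_grad():
+#        for p_target, p in zip(target_params, trainable_params):
+#            p_target.mul_(1 - tau).add_(p, alpha=tau)
+#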
+
+from tensordict.nn import TensorDictModule, TensorDictSequential
+
+
+def _init(
+ self,
+ actor_network: TensorDictModule,
+ value_network: TensorDictModule,
+) -> None:
+ super(type(self), self).__init__()
+
+ self.convert_to_functional(
+ actor_network,
+ "actor_network",
+ create_target_params=True,
+ )
+ self.convert_to_functional(
+ value_network,
+ "value_network",
+ create_target_params=True,
+ compare_against=list(actor_network.parameters()),
+ )
+
+ self.actor_in_keys = actor_network.in_keys
+
+ # Since the value we'll be using is based on the actor and value network,
+ # we put them together in a single actor-critic container.
+ actor_critic = ActorCriticWrapper(actor_network, value_network)
+ self.actor_critic = actor_critic
+ self.loss_function = "l2"
+
+
+###############################################################################
+# The value estimator loss method
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In many RL algorithms, the value network (or Q-value network) is trained based
+# on an empirical value estimate. This can be bootstrapped (TD(0), low
+# variance, high bias), meaning
+# that the target value is obtained using the next reward and nothing else, or
+# a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole
+# sequence of upcoming rewards will be used (high variance, low bias). An
+# intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise
+# bias and variance.
+# TorchRL makes it easy to use one or the other estimator via the
+# :class:`~torchrl.objectives.utils.ValueEstimators` Enum class, which contains
+# pointers to all the value estimators implemented. Let us define the default
+# value function here. We will take the simplest version (TD(0)), and show later
+# on how this can be changed.
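+#
+# As a hedged reminder (pseudo-code, not TorchRL's API), the TD(0) target for
+# a state-action value on a single transition reads:
+#
+# .. code-block:: python
+#
+#    # hypothetical names: Q_target uses the target parameters discussed above
+#    target = reward + gamma * (1 - done) * Q_target(next_obs, policy(next_obs))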
+
+from torchrl.objectives.utils import ValueEstimators
+
+default_value_estimator = ValueEstimators.TD0
+
+###############################################################################
+# We also need to give some instructions to DDPG on how to build the value
+# estimator, depending on the user query. Depending on the estimator provided,
+# we will build the corresponding module to be used at train time:
+
+from torchrl.objectives.utils import default_value_kwargs
+from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator
+
+
+def make_value_estimator(self, value_type: ValueEstimators, **hyperparams):
+ hp = dict(default_value_kwargs(value_type))
+ if hasattr(self, "gamma"):
+ hp["gamma"] = self.gamma
+ hp.update(hyperparams)
+ value_key = "state_action_value"
+ if value_type == ValueEstimators.TD1:
+ self._value_estimator = TD1Estimator(value_network=self.actor_critic, **hp)
+ elif value_type == ValueEstimators.TD0:
+ self._value_estimator = TD0Estimator(value_network=self.actor_critic, **hp)
+ elif value_type == ValueEstimators.GAE:
+ raise NotImplementedError(
+ f"Value type {value_type} it not implemented for loss {type(self)}."
+ )
+ elif value_type == ValueEstimators.TDLambda:
+ self._value_estimator = TDLambdaEstimator(value_network=self.actor_critic, **hp)
+ else:
+ raise NotImplementedError(f"Unknown value type {value_type}")
+ self._value_estimator.set_keys(value=value_key)
+
+
+###############################################################################
+# The ``make_value_estimator`` method can be called, but does not need to be: if
+# it is not, the :class:`~torchrl.objectives.LossModule` will query this method with
+# its default estimator.
+#
+# The actor loss method
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# The central piece of an RL algorithm is the training loss for the actor.
+# In the case of DDPG, this function is quite simple: we just need to compute
+# the value associated with an action computed using the policy and optimize
+# the actor weights to maximize this value.
+#
+# When computing this value, we must make sure to take the value parameters out
+# of the graph, otherwise the actor and value loss will be mixed up.
+# For this, the :func:`~torchrl.objectives.utils.hold_out_params` function
+# can be used.
+
+
+def _loss_actor(
+ self,
+ tensordict,
+) -> torch.Tensor:
+ td_copy = tensordict.select(*self.actor_in_keys)
+ # Get an action from the actor network: since we made it functional, we need to pass the params
+ with self.actor_network_params.to_module(self.actor_network):
+ td_copy = self.actor_network(td_copy)
+ # get the value associated with that action
+ with self.value_network_params.detach().to_module(self.value_network):
+ td_copy = self.value_network(td_copy)
+ return -td_copy.get("state_action_value")
+
+
+###############################################################################
+# The value loss method
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# We now need to optimize our value network parameters.
+# To do this, we will rely on the value estimator of our class:
+#
+
+from torchrl.objectives.utils import distance_loss
+
+
+def _loss_value(
+ self,
+ tensordict,
+):
+ td_copy = tensordict.clone()
+
+ # V(s, a)
+ with self.value_network_params.to_module(self.value_network):
+ self.value_network(td_copy)
+ pred_val = td_copy.get("state_action_value").squeeze(-1)
+
+ # we manually reconstruct the parameters of the actor-critic, where the first
+ # set of parameters belongs to the actor and the second to the value function.
+ target_params = TensorDict(
+ {
+ "module": {
+ "0": self.target_actor_network_params,
+ "1": self.target_value_network_params,
+ }
+ },
+ batch_size=self.target_actor_network_params.batch_size,
+ device=self.target_actor_network_params.device,
+ )
+ with target_params.to_module(self.actor_critic):
+ target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)
+
+ # Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function`
+ loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function)
+ td_error = (pred_val - target_value).pow(2)
+
+ return loss_value, td_error, pred_val, target_value
+
+
+###############################################################################
+# Putting things together in a forward call
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The only missing piece is the forward method, which will glue together the
+# value and actor loss, collect the cost values and write them in a ``TensorDict``
+# delivered to the user.
+
+from tensordict import TensorDict, TensorDictBase
+
+
+def _forward(self, input_tensordict: TensorDictBase) -> TensorDict:
+ loss_value, td_error, pred_val, target_value = self.loss_value(
+ input_tensordict,
+ )
+ td_error = td_error.detach()
+ td_error = td_error.unsqueeze(input_tensordict.ndimension())
+ if input_tensordict.device is not None:
+ td_error = td_error.to(input_tensordict.device)
+ input_tensordict.set(
+ "td_error",
+ td_error,
+ inplace=True,
+ )
+ loss_actor = self.loss_actor(input_tensordict)
+ return TensorDict(
+ source={
+ "loss_actor": loss_actor.mean(),
+ "loss_value": loss_value.mean(),
+ "pred_value": pred_val.mean().detach(),
+ "target_value": target_value.mean().detach(),
+ "pred_value_max": pred_val.max().detach(),
+ "target_value_max": target_value.max().detach(),
+ },
+ batch_size=[],
+ )
+
+
+from torchrl.objectives import LossModule
+
+
+class DDPGLoss(LossModule):
+ default_value_estimator = default_value_estimator
+ make_value_estimator = make_value_estimator
+
+ __init__ = _init
+ forward = _forward
+ loss_value = _loss_value
+ loss_actor = _loss_actor
+
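+###############################################################################
+# As a quick, hedged usage sketch (``actor`` and ``qnet`` are only built later
+# in this tutorial, and ``replay_buffer`` is hypothetical here), the assembled
+# loss is used like any other TorchRL objective:
+#
+# .. code-block:: python
+#
+#    loss_module = DDPGLoss(actor, qnet)
+#    loss_dict = loss_module(replay_buffer.sample())
+#    loss_val = sum(v for k, v in loss_dict.items() if k.startswith("loss_"))
+#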
+
+###############################################################################
+# Now that we have our loss, we can use it to train a policy to solve a
+# control task.
+#
+# Environment
+# -----------
+#
+# In most algorithms, the first thing that needs to be taken care of is the
+# construction of the environment as it conditions the remainder of the
+# training script.
+#
+# For this example, we will be using the ``"cheetah"`` task. The goal is to make
+# a half-cheetah run as fast as possible.
+#
+# In TorchRL, one can create such a task by relying on ``dm_control`` or ``gym``:
+#
+# .. code-block:: python
+#
+# env = GymEnv("HalfCheetah-v4")
+#
+# or
+#
+# .. code-block:: python
+#
+# env = DMControlEnv("cheetah", "run")
+#
+# By default, these environments disable rendering. Training from states is
+# usually easier than training from images. To keep things simple, we focus
+# on learning from states only. To pass the pixels to the ``tensordicts`` that
+# are collected by :func:`env.step()`, simply pass the ``from_pixels=True``
+# argument to the constructor:
+#
+# .. code-block:: python
+#
+# env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True)
+#
+# We write a :func:`make_env` helper function that will create an environment
+# with either one of the two backends considered above (``dm-control`` or ``gym``).
+#
+
+from torchrl.envs.libs.dm_control import DMControlEnv
+from torchrl.envs.libs.gym import GymEnv
+
+env_library = None
+env_name = None
+
+
+def make_env(from_pixels=False):
+ """Create a base ``env``."""
+ global env_library
+ global env_name
+
+ if backend == "dm_control":
+ env_name = "cheetah"
+ env_task = "run"
+ env_args = (env_name, env_task)
+ env_library = DMControlEnv
+ elif backend == "gym":
+ env_name = "HalfCheetah-v4"
+ env_args = (env_name,)
+ env_library = GymEnv
+ else:
+ raise NotImplementedError
+
+ env_kwargs = {
+ "device": device,
+ "from_pixels": from_pixels,
+ "pixels_only": from_pixels,
+ "frame_skip": 2,
+ }
+ env = env_library(*env_args, **env_kwargs)
+ return env
+
+
+###############################################################################
+# Transforms
+# ~~~~~~~~~~
+#
+# Now that we have a base environment, we may want to modify its representation
+# to make it more policy-friendly. In TorchRL, transforms are appended to the
+# base environment in a specialized :class:`torchrl.envs.TransformedEnv` class.
+#
+# - It is common in DDPG to rescale the reward using some heuristic value. We
+# will multiply the reward by 5 in this example.
+#
+# - If we are using :mod:`dm_control`, it is also important to build an interface
+# between the simulator, which works with double precision numbers, and our
+# script, which presumably uses single precision ones. This transformation goes
+# both ways: when calling :func:`env.step`, our actions will need to be
+# represented in double precision, and the output will need to be transformed
+# to single precision.
+# The :class:`~torchrl.envs.DoubleToFloat` transform does exactly this: the
+# ``in_keys`` list refers to the keys that will need to be transformed from
+# double to float, while the ``in_keys_inv`` refers to those that need to
+# be transformed to double before being passed to the environment (a short
+# sketch follows this list).
+#
+# - We concatenate the state keys together using the :class:`~torchrl.envs.CatTensors`
+# transform.
+#
+# - Finally, we also leave the possibility of normalizing the states: we will
+# take care of computing the normalizing constants later on.
+#
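+# As a hedged sketch of the two ``DoubleToFloat`` key lists mentioned above
+# (the key names are hypothetical; the code below uses the argument-free form):
+#
+# .. code-block:: python
+#
+#    DoubleToFloat(
+#        in_keys=["observation_vector"],  # read from env as float64, cast to float32
+#        in_keys_inv=["action"],          # cast back to float64 before env.step
+#    )
+#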
+
+from torchrl.envs import (
+ CatTensors,
+ DoubleToFloat,
+ EnvCreator,
+ InitTracker,
+ ObservationNorm,
+ ParallelEnv,
+ RewardScaling,
+ StepCounter,
+ TransformedEnv,
+)
+
+
+def make_transformed_env(
+ env,
+):
+ """Apply transforms to the ``env`` (such as reward scaling and state normalization)."""
+
+ env = TransformedEnv(env)
+
+ # we append transforms one by one, although we might as well create the
+ # transformed environment using the `env = TransformedEnv(base_env, transforms)`
+ # syntax.
+ env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling))
+
+ # We concatenate all states into a single "observation_vector"
+# even if there is a single tensor, it'll be renamed to "observation_vector".
+ # This facilitates the downstream operations as we know the name of the
+ # output tensor.
+ # In some environments (not half-cheetah), there may be more than one
+ # observation vector: in this case this code snippet will concatenate them
+ # all.
+ selected_keys = list(env.observation_spec.keys())
+ out_key = "observation_vector"
+ env.append_transform(CatTensors(in_keys=selected_keys, out_key=out_key))
+
+ # we normalize the states, but for now let's just instantiate a stateless
+ # version of the transform
+ env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True))
+
+ env.append_transform(DoubleToFloat())
+
+ env.append_transform(StepCounter(max_frames_per_traj))
+
+ # We need a marker for the start of trajectories for our Ornstein-Uhlenbeck (OU)
+ # exploration:
+ env.append_transform(InitTracker())
+
+ return env
+
+
+###############################################################################
+# Parallel execution
+# ~~~~~~~~~~~~~~~~~~
+#
+# The following helper function allows us to run environments in parallel.
+# Running environments in parallel can significantly speed up the collection
+# throughput. When using a transformed environment, we need to choose whether we
+# want to execute the transform individually for each environment, or
+# centralize the data and transform it in batch. Both approaches are easy to
+# code:
+#
+# .. code-block:: python
+#
+# env = ParallelEnv(
+# lambda: TransformedEnv(GymEnv("HalfCheetah-v4"), transforms),
+# num_workers=4
+# )
+# env = TransformedEnv(
+# ParallelEnv(lambda: GymEnv("HalfCheetah-v4"), num_workers=4),
+# transforms
+# )
+#
+# To leverage the vectorization capabilities of PyTorch, we adopt
+# the first method:
+#
+
+
+def parallel_env_constructor(
+ env_per_collector,
+ transform_state_dict,
+):
+ if env_per_collector == 1:
+
+ def make_t_env():
+ env = make_transformed_env(make_env())
+ env.transform[2].init_stats(3)
+ env.transform[2].loc.copy_(transform_state_dict["loc"])
+ env.transform[2].scale.copy_(transform_state_dict["scale"])
+ return env
+
+ env_creator = EnvCreator(make_t_env)
+ return env_creator
+
+ parallel_env = ParallelEnv(
+ num_workers=env_per_collector,
+ create_env_fn=EnvCreator(lambda: make_env()),
+ create_env_kwargs=None,
+ pin_memory=False,
+ )
+ env = make_transformed_env(parallel_env)
+ # we call `init_stats` for a limited number of steps, just to instantiate
+ # the lazy buffers.
+ env.transform[2].init_stats(3, cat_dim=1, reduce_dim=[0, 1])
+ env.transform[2].load_state_dict(transform_state_dict)
+ return env
+
+
+# The backend can be ``gym`` or ``dm_control``
+backend = "gym"
+
+###############################################################################
+# .. note::
+#
+# ``frame_skip`` batches multiple steps together with a single action.
+# If > 1, the other frame counts (for example, ``frames_per_batch``, ``total_frames``)
+# need to be adjusted to have a consistent total number of frames collected
+# across experiments. This is important as raising the frame-skip but keeping the
+# total number of frames unchanged may seem like cheating: all things compared,
+# a dataset of 10M elements collected with a frame-skip of 2 and another with
+# a frame-skip of 1 actually have a ratio of interactions with the environment
+# of 2:1! In a nutshell, one should be cautious about the frame-count of a
+# training script when dealing with frame skipping as this may lead to
+# biased comparisons between training strategies.
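+#
+# For example (simple arithmetic, not library code), with a frame-skip of 2 a
+# batch of 1000 collected frames corresponds to 2000 simulator interactions:
+#
+# .. code-block:: python
+#
+#    frame_skip = 2
+#    frames_per_batch = 1000
+#    simulator_steps = frames_per_batch * frame_skip  # 2000 env interactions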
+#
+# Scaling the reward helps us control the signal magnitude for more
+# efficient learning.
+reward_scaling = 5.0
+
+###############################################################################
+# We also define when a trajectory will be truncated. A thousand steps (500 if
+# frame-skip = 2) is a good number to use for the cheetah task:
+
+max_frames_per_traj = 500
+
+###############################################################################
+# Normalization of the observations
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# To compute the normalizing statistics, we run an arbitrary number of random
+# steps in the environment and compute the mean and standard deviation of the
+# collected observations. The :func:`ObservationNorm.init_stats()` method can
+# be used for this purpose. To get the summary statistics, we create a dummy
+# environment, run it for a given number of steps, and compute the summary
+# statistics of the data collected along the way.
+#
+
+
+def get_env_stats():
+ """Gets the stats of an environment."""
+ proof_env = make_transformed_env(make_env())
+ t = proof_env.transform[2]
+ t.init_stats(init_env_steps)
+ transform_state_dict = t.state_dict()
+ proof_env.close()
+ return transform_state_dict
+
+
+###############################################################################
+# Normalization stats
+# ~~~~~~~~~~~~~~~~~~~
+# Number of random steps used for stats computation with ``ObservationNorm``
+
+init_env_steps = 5000
+
+transform_state_dict = get_env_stats()
+
+###############################################################################
+# Number of environments in each data collector
+env_per_collector = 4
+
+###############################################################################
+# We pass the stats computed earlier to normalize the output of our
+# environment:
+
+parallel_env = parallel_env_constructor(
+ env_per_collector=env_per_collector,
+ transform_state_dict=transform_state_dict,
+)
+
+
+from torchrl.data import CompositeSpec
+
+###############################################################################
+# Building the model
+# ------------------
+#
+# We now turn to the setup of the model. As we have seen, DDPG requires a
+# value network, trained to estimate the value of a state-action pair, and a
+# parametric actor that learns how to select actions that maximize this value.
+#
+# Recall that building a TorchRL module requires two steps:
+#
+# - writing the :class:`torch.nn.Module` that will be used as network,
+# - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the
+# data flow is handled by specifying the input and output keys.
+#
+# In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can
+# also be used.
+#
+#
+# The Q-Value network is wrapped in a :class:`~torchrl.modules.ValueOperator`
+# that automatically sets the ``out_keys`` to ``"state_action_value"`` for Q-value
+# networks and ``"state_value"`` for other value networks.
+#
+# TorchRL provides a built-in version of the DDPG networks as presented in the
+# original paper. These can be found under :class:`~torchrl.modules.DdpgMlpActor`
+# and :class:`~torchrl.modules.DdpgMlpQNet`.
+#
+# Since we use lazy modules, it is necessary to materialize the lazy modules
+# before being able to move the policy from device to device and perform other
+# operations. Hence, it is good practice to run the modules with a small
+# sample of data. For this purpose, we generate fake data from the
+# environment specs.
+#
+
+from torchrl.modules import (
+ ActorCriticWrapper,
+ DdpgMlpActor,
+ DdpgMlpQNet,
+ OrnsteinUhlenbeckProcessModule,
+ ProbabilisticActor,
+ TanhDelta,
+ ValueOperator,
+)
+
+
+def make_ddpg_actor(
+ transform_state_dict,
+ device="cpu",
+):
+ proof_environment = make_transformed_env(make_env())
+ proof_environment.transform[2].init_stats(3)
+ proof_environment.transform[2].load_state_dict(transform_state_dict)
+
+ out_features = proof_environment.action_spec.shape[-1]
+
+ actor_net = DdpgMlpActor(
+ action_dim=out_features,
+ )
+
+ in_keys = ["observation_vector"]
+ out_keys = ["param"]
+
+ actor = TensorDictModule(
+ actor_net,
+ in_keys=in_keys,
+ out_keys=out_keys,
+ )
+
+ actor = ProbabilisticActor(
+ actor,
+ distribution_class=TanhDelta,
+ in_keys=["param"],
+ spec=CompositeSpec(action=proof_environment.action_spec),
+ ).to(device)
+
+ q_net = DdpgMlpQNet()
+
+ in_keys = in_keys + ["action"]
+ qnet = ValueOperator(
+ in_keys=in_keys,
+ module=q_net,
+ ).to(device)
+
+ # initialize lazy modules
+ qnet(actor(proof_environment.reset().to(device)))
+ return actor, qnet
+
+
+actor, qnet = make_ddpg_actor(
+ transform_state_dict=transform_state_dict,
+ device=device,
+)
+
+###############################################################################
+# Exploration
+# ~~~~~~~~~~~
+#
+# The policy is passed into an :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule`
+# exploration module, as suggested in the original paper.
+# Let's define the number of frames before the OU noise reaches its minimum value:
+annealing_frames = 1_000_000
+
+actor_model_explore = TensorDictSequential(
+ actor,
+ OrnsteinUhlenbeckProcessModule(
+ spec=actor.spec.clone(),
+ annealing_num_steps=annealing_frames,
+ ).to(device),
+)
+if device == torch.device("cpu"):
+ actor_model_explore.share_memory()
+
+
+###############################################################################
+# Data collector
+# --------------
+#
+# TorchRL provides specialized classes to help you collect data by executing
+# the policy in the environment. These "data collectors" iteratively compute
+# the action to be executed at a given time, then execute a step in the
+# environment and reset it when required.
+# Data collectors are designed to give developers tight control
+# over the number of frames per batch of data, the (a)sync nature of this
+# collection, and the resources allocated to the data collection (for example
+# GPU, number of workers, and so on).
+#
+# Here we will use
+# :class:`~torchrl.collectors.SyncDataCollector`, a simple, single-process
+# data collector. TorchRL offers other collectors, such as
+# :class:`~torchrl.collectors.MultiaSyncDataCollector`, which executes the
+# rollouts in an asynchronous manner (for example, data is collected while
+# the policy is being optimized, thereby decoupling the training and
+# data collection).
+#
+# The parameters to specify are:
+#
+# - an environment factory or an environment,
+# - the policy,
+# - the total number of frames before the collector is considered empty,
+# - the maximum number of frames per trajectory (useful for non-terminating
+# environments, like ``dm_control`` ones).
+#
+# .. note::
+#
+# The ``max_frames_per_traj`` passed to the collector will have the effect
+# of registering a new :class:`~torchrl.envs.StepCounter` transform
+# with the environment used for inference. We can achieve the same result
+# manually, as we do in this script.
+#
+# One should also pass:
+#
+# - the number of frames in each batch collected,
+# - the number of random steps executed independently from the policy,
+# - the devices used for policy execution
+# - the devices used to store data before the data is passed to the main
+# process.
+#
+# The total frames we will use during training should be around 1M.
+total_frames = 10_000 # 1_000_000
+
+###############################################################################
+# The number of frames returned by the collector at each iteration of the outer
+# loop is equal to the length of each sub-trajectory times the number of
+# environments run in parallel in each collector.
+#
+# In other words, we expect batches from the collector to have a shape
+# ``[env_per_collector, traj_len]`` where
+# ``traj_len=frames_per_batch/env_per_collector``:
+#
+traj_len = 200
+frames_per_batch = env_per_collector * traj_len
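+# with the values above: 4 envs * 200 steps = 800 frames per batch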
+init_random_frames = 5000
+num_collectors = 2
+
+from torchrl.collectors import SyncDataCollector
+from torchrl.envs import ExplorationType
+
+collector = SyncDataCollector(
+ parallel_env,
+ policy=actor_model_explore,
+ total_frames=total_frames,
+ frames_per_batch=frames_per_batch,
+ init_random_frames=init_random_frames,
+ reset_at_each_iter=False,
+ split_trajs=False,
+ device=collector_device,
+ exploration_type=ExplorationType.RANDOM,
+)
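+
+###############################################################################
+# As a quick sanity check (a sketch using the objects defined above), the first
+# batch yielded by the collector should have a leading batch size of
+# ``[env_per_collector, traj_len]``. It is left commented out here so that it
+# does not consume frames from the training budget:
+#
+# .. code-block:: python
+#
+#    batch = next(iter(collector))
+#    assert batch.batch_size == torch.Size([env_per_collector, traj_len])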
+
+###############################################################################
+# Evaluator: building your recorder object
+# ----------------------------------------
+#
+# As the training data is obtained using some exploration strategy, the true
+# performance of our algorithm needs to be assessed in deterministic mode. We
+# do this using a dedicated class, ``Recorder``, which executes the policy in
+# the environment at a given frequency and returns some statistics obtained
+# from these simulations.
+#
+# The following helper function builds this object:
+from torchrl.trainers import Recorder
+
+
+def make_recorder(actor_model_explore, transform_state_dict, record_interval):
+ base_env = make_env()
+ environment = make_transformed_env(base_env)
+ environment.transform[2].init_stats(
+ 3
+ ) # must be instantiated to load the state dict
+ environment.transform[2].load_state_dict(transform_state_dict)
+
+ recorder_obj = Recorder(
+ record_frames=1000,
+ policy_exploration=actor_model_explore,
+ environment=environment,
+ exploration_type=ExplorationType.DETERMINISTIC,
+ record_interval=record_interval,
+ )
+ return recorder_obj
+
+
+###############################################################################
+# We will be recording the performance every 10 batches collected
+record_interval = 10
+
+recorder = make_recorder(
+ actor_model_explore, transform_state_dict, record_interval=record_interval
+)
+
+from torchrl.data.replay_buffers import (
+ LazyMemmapStorage,
+ PrioritizedSampler,
+ RandomSampler,
+ TensorDictReplayBuffer,
+)
+
+###############################################################################
+# Replay buffer
+# -------------
+#
+# Replay buffers come in two flavors: prioritized (where some error signal
+# is used to give a higher likelihood of sampling to some items than others)
+# and regular, circular experience replay.
+#
+# TorchRL replay buffers are composable: one can pick the storage, sampling
+# and writing strategies. It is also possible to
+# store tensors on physical memory using a memory-mapped array. The following
+# function takes care of creating the replay buffer with the desired
+# hyperparameters:
+#
+
+from torchrl.envs import RandomCropTensorDict
+
+
+def make_replay_buffer(buffer_size, batch_size, random_crop_len, prefetch=3, prb=False):
+ if prb:
+ sampler = PrioritizedSampler(
+ max_capacity=buffer_size,
+ alpha=0.7,
+ beta=0.5,
+ )
+ else:
+ sampler = RandomSampler()
+ replay_buffer = TensorDictReplayBuffer(
+ storage=LazyMemmapStorage(
+ buffer_size,
+ scratch_dir=buffer_scratch_dir,
+ ),
+ batch_size=batch_size,
+ sampler=sampler,
+ pin_memory=False,
+ prefetch=prefetch,
+ transform=RandomCropTensorDict(random_crop_len, sample_dim=1),
+ )
+ return replay_buffer
+
+
+###############################################################################
+# We'll store the replay buffer in a temporary directory on disk
+
+import tempfile
+
+tmpdir = tempfile.TemporaryDirectory()
+buffer_scratch_dir = tmpdir.name
+
+###############################################################################
+# Replay buffer storage and batch size
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The TorchRL replay buffer counts the number of elements along the first dimension.
+# Since we'll be feeding trajectories to our buffer, we need to adapt the buffer
+# size by dividing it by the length of the sub-trajectories yielded by our
+# data collector.
+# Regarding the batch size, our sampling strategy consists of sampling
+# trajectories of length ``traj_len=200`` before selecting sub-trajectories
+# of length ``random_crop_len=25`` on which the loss will be computed.
+# This strategy balances the choice of storing whole trajectories of a certain
+# length with the need to provide samples with sufficient heterogeneity
+# to our loss. The following figure shows the dataflow from a collector
+# that gets 8 frames in each batch with 2 environments run in parallel,
+# feeds them to a replay buffer that contains 1000 trajectories and
+# samples sub-trajectories of 2 time steps each.
+#
+# .. figure:: /_static/img/replaybuffer_traj.png
+# :alt: Storing trajectories in the replay buffer
+#
+# Let's start with the number of frames stored in the buffer
+
+
+def ceil_div(x, y):
+ return -(-x // y)  # ceiling division: rounds up instead of down
+
+
+buffer_size = 1_000_000
+buffer_size = ceil_div(buffer_size, traj_len)
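+# with 1M frames and 200-step trajectories, the buffer stores 5_000 trajectories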
+
+###############################################################################
+# Prioritized replay buffer is disabled by default
+prb = False
+
+###############################################################################
+# We also need to define how many updates we'll be doing per batch of data
+# collected. This is known as the update-to-data or ``UTD`` ratio:
+update_to_data = 64
+
+###############################################################################
+# We'll be feeding the loss with trajectories of length 25:
+random_crop_len = 25
+
+###############################################################################
+# In the original paper, the authors perform one update with a batch of 64
+# elements for each frame collected. Here, we reproduce the same ratio
+# but perform several updates at each batch collection. We
+# adapt our batch size to achieve the same update-per-frame ratio:
+
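+# With the defaults above (``frames_per_batch = 800``), this amounts to
+# ceil(64 * 800 / (64 * 25)) = 32 sub-trajectories per optimization step.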
+batch_size = ceil_div(64 * frames_per_batch, update_to_data * random_crop_len)
+
+replay_buffer = make_replay_buffer(
+ buffer_size=buffer_size,
+ batch_size=batch_size,
+ random_crop_len=random_crop_len,
+ prefetch=3,
+ prb=prb,
+)
+
+###############################################################################
+# Loss module construction
+# ------------------------
+#
+# We build our loss module with the actor and ``qnet`` we've just created.
+# Because we have target parameters to update, we *must* create a target network
+# updater.
+#
+
+gamma = 0.99
+lmbda = 0.9
+tau = 0.001 # Decay factor for the target network
+
+loss_module = DDPGLoss(actor, qnet)
+
+###############################################################################
+# Let's use the TD(lambda) estimator!
+loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=lmbda, device=device)
+
+###############################################################################
+# .. note::
+# Off-policy learning usually dictates a TD(0) estimator. Here, we use a TD(:math:`\lambda`)
+# estimator, which will introduce some bias as the trajectory that follows
+# a certain state has been collected with an outdated policy.
+# This trick, like the multi-step trick that can be used during data collection,
+# is one of the "hacks" that we usually find to work well in
+# practice despite the fact that they introduce some bias in the return
+# estimates.
+#
+# Target network updater
+# ~~~~~~~~~~~~~~~~~~~~~~
+#
+# Target networks are a crucial part of off-policy RL algorithms.
+# Updating the target network parameters is made easy thanks to the
+# :class:`~torchrl.objectives.HardUpdate` and :class:`~torchrl.objectives.SoftUpdate`
+# classes. They're built with the loss module as argument, and the update is
+# achieved via a call to `updater.step()` at the appropriate location in the
+# training loop.
+
+from torchrl.objectives.utils import SoftUpdate
+
+target_net_updater = SoftUpdate(loss_module, eps=1 - tau)
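+# with eps = 1 - tau = 0.999, each call to ``target_net_updater.step()`` performs
+# theta_target <- 0.999 * theta_target + 0.001 * theta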
+
+###############################################################################
+# Optimizer
+# ~~~~~~~~~
+#
+# Finally, we will use the Adam optimizer for the policy and value network:
+
+from torch import optim
+
+optimizer_actor = optim.Adam(
+ loss_module.actor_network_params.values(True, True), lr=1e-4, weight_decay=0.0
+)
+optimizer_value = optim.Adam(
+ loss_module.value_network_params.values(True, True), lr=1e-3, weight_decay=1e-2
+)
+total_collection_steps = total_frames // frames_per_batch
+
+###############################################################################
+# Time to train the policy
+# ------------------------
+#
+# The training loop is pretty straightforward now that we have built all the
+# modules we need.
+#
+
+rewards = []
+rewards_eval = []
+
+# Main loop
+
+collected_frames = 0
+pbar = tqdm.tqdm(total=total_frames)
+r0 = None
+for i, tensordict in enumerate(collector):
+
+ # update weights of the inference policy
+ collector.update_policy_weights_()
+
+ if r0 is None:
+ r0 = tensordict["next", "reward"].mean().item()
+ pbar.update(tensordict.numel())
+
+ # extend the replay buffer with the new data
+ current_frames = tensordict.numel()
+ collected_frames += current_frames
+ replay_buffer.extend(tensordict.cpu())
+
+ # optimization steps
+ if collected_frames >= init_random_frames:
+ for _ in range(update_to_data):
+ # sample from replay buffer
+ sampled_tensordict = replay_buffer.sample().to(device)
+
+ # Compute loss
+ loss_dict = loss_module(sampled_tensordict)
+
+ # optimize
+ loss_dict["loss_actor"].backward()
+ gn1 = torch.nn.utils.clip_grad_norm_(
+ loss_module.actor_network_params.values(True, True), 10.0
+ )
+ optimizer_actor.step()
+ optimizer_actor.zero_grad()
+
+ loss_dict["loss_value"].backward()
+ gn2 = torch.nn.utils.clip_grad_norm_(
+ loss_module.value_network_params.values(True, True), 10.0
+ )
+ optimizer_value.step()
+ optimizer_value.zero_grad()
+
+ gn = (gn1**2 + gn2**2) ** 0.5
+
+ # update priority
+ if prb:
+ replay_buffer.update_tensordict_priority(sampled_tensordict)
+ # update target network
+ target_net_updater.step()
+
+ rewards.append(
+ (
+ i,
+ tensordict["next", "reward"].mean().item(),
+ )
+ )
+ td_record = recorder(None)
+ if td_record is not None:
+ rewards_eval.append((i, td_record["r_evaluation"].item()))
+ if len(rewards_eval) and collected_frames >= init_random_frames:
+ target_value = loss_dict["target_value"].item()
+ loss_value = loss_dict["loss_value"].item()
+ loss_actor = loss_dict["loss_actor"].item()
+ rn = sampled_tensordict["next", "reward"].mean().item()
+ rs = sampled_tensordict["next", "reward"].std().item()
+ pbar.set_description(
+ f"reward: {rewards[-1][1]: 4.2f} (r0 = {r0: 4.2f}), "
+ f"reward eval: reward: {rewards_eval[-1][1]: 4.2f}, "
+ f"reward normalized={rn :4.2f}/{rs :4.2f}, "
+ f"grad norm={gn: 4.2f}, "
+ f"loss_value={loss_value: 4.2f}, "
+ f"loss_actor={loss_actor: 4.2f}, "
+ f"target value: {target_value: 4.2f}"
+ )
+
+ # update the exploration strategy
+ actor_model_explore[1].step(current_frames)
+
+collector.shutdown()
+del collector
+
+###############################################################################
+# Experiment results
+# ------------------
+#
+# We make a simple plot of the average rewards during training. We can observe
+# that our policy learned to solve the task quite well.
+#
+# .. note::
+# As already mentioned above, to get a more reasonable performance,
+# use a greater value for ``total_frames``, for example, 1M.
+
+from matplotlib import pyplot as plt
+
+plt.figure()
+plt.plot(*zip(*rewards), label="training")
+plt.plot(*zip(*rewards_eval), label="eval")
+plt.legend()
+plt.xlabel("iter")
+plt.ylabel("reward")
+plt.tight_layout()
+
+###############################################################################
+# Conclusion
+# ----------
+#
+# In this tutorial, we have learned how to code a loss module in TorchRL given
+# the concrete example of DDPG.
+#
+# The key takeaways are:
+#
+# - How to use the :class:`~torchrl.objectives.LossModule` class to code up a new
+# loss component;
+# - How to use (or not) a target network, and how to update its parameters;
+# - How to create an optimizer associated with a loss module.
+#
+# Next Steps
+# ----------
+#
+# To iterate further on this loss module we might consider:
+#
+# - Using `@dispatch` (see `[Feature] Dispatch IQL loss module `_.)
+# - Allowing flexible TensorDict keys.
+#
diff --git a/advanced_source/cpp_autograd.rst b/advanced_source/cpp_autograd.rst
new file mode 100644
index 00000000000..51e5e0b358f
--- /dev/null
+++ b/advanced_source/cpp_autograd.rst
@@ -0,0 +1,437 @@
+Autograd in C++ Frontend
+========================
+
+The ``autograd`` package is crucial for building highly flexible and dynamic neural
+networks in PyTorch. Most of the autograd APIs in the PyTorch Python frontend are also available
+in the C++ frontend, allowing easy translation of autograd code from Python to C++.
+
+In this tutorial, we explore several examples of doing autograd in the PyTorch C++ frontend.
+Note that this tutorial assumes that you already have a basic understanding of
+autograd in the Python frontend. If that's not the case, please first read
+`Autograd: Automatic Differentiation `_.
+
+Basic autograd operations
+-------------------------
+
+(Adapted from `this tutorial `_)
+
+Create a tensor and set ``torch::requires_grad()`` to track computation with it
+
+.. code-block:: cpp
+
+ auto x = torch::ones({2, 2}, torch::requires_grad());
+ std::cout << x << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 1 1
+ 1 1
+ [ CPUFloatType{2,2} ]
+
+
+Do a tensor operation:
+
+.. code-block:: cpp
+
+ auto y = x + 2;
+ std::cout << y << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 3 3
+ 3 3
+ [ CPUFloatType{2,2} ]
+
+``y`` was created as a result of an operation, so it has a ``grad_fn``.
+
+.. code-block:: cpp
+
+ std::cout << y.grad_fn()->name() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ AddBackward1
+
+Do more operations on ``y``
+
+.. code-block:: cpp
+
+ auto z = y * y * 3;
+ auto out = z.mean();
+
+ std::cout << z << std::endl;
+ std::cout << z.grad_fn()->name() << std::endl;
+ std::cout << out << std::endl;
+ std::cout << out.grad_fn()->name() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 27 27
+ 27 27
+ [ CPUFloatType{2,2} ]
+ MulBackward1
+ 27
+ [ CPUFloatType{} ]
+ MeanBackward0
+
+
+``.requires_grad_( ... )`` changes an existing tensor's ``requires_grad`` flag in-place.
+
+.. code-block:: cpp
+
+ auto a = torch::randn({2, 2});
+ a = ((a * 3) / (a - 1));
+ std::cout << a.requires_grad() << std::endl;
+
+ a.requires_grad_(true);
+ std::cout << a.requires_grad() << std::endl;
+
+ auto b = (a * a).sum();
+ std::cout << b.grad_fn()->name() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ false
+ true
+ SumBackward0
+
+Let's backprop now. Because ``out`` contains a single scalar, ``out.backward()``
+is equivalent to ``out.backward(torch::tensor(1.))``.
+
+.. code-block:: cpp
+
+ out.backward();
+
+Print gradients d(out)/dx
+
+.. code-block:: cpp
+
+ std::cout << x.grad() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 4.5000 4.5000
+ 4.5000 4.5000
+ [ CPUFloatType{2,2} ]
+
+You should have gotten a matrix of ``4.5``. Since ``out = (1/4) * sum_i 3 * (x_i + 2)^2``,
+we have ``d(out)/dx_i = (3/2) * (x_i + 2)``, which equals ``4.5`` at ``x_i = 1``. For further
+explanations on how we arrive at this value,
+please see `the corresponding section in this tutorial `_.
+
+Now let's take a look at an example of vector-Jacobian product:
+
+.. code-block:: cpp
+
+ x = torch::randn(3, torch::requires_grad());
+
+ y = x * 2;
+ while (y.norm().item<double>() < 1000) {
+ y = y * 2;
+ }
+
+ std::cout << y << std::endl;
+ std::cout << y.grad_fn()->name() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ -1021.4020
+ 314.6695
+ -613.4944
+ [ CPUFloatType{3} ]
+ MulBackward1
+
+If we want the vector-Jacobian product, pass the vector to ``backward`` as argument:
+
+.. code-block:: cpp
+
+ auto v = torch::tensor({0.1, 1.0, 0.0001}, torch::kFloat);
+ y.backward(v);
+
+ std::cout << x.grad() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 102.4000
+ 1024.0000
+ 0.1024
+ [ CPUFloatType{3} ]
+
+You can also stop autograd from tracking history on tensors that require gradients
+either by putting ``torch::NoGradGuard`` in a code block
+
+.. code-block:: cpp
+
+ std::cout << x.requires_grad() << std::endl;
+ std::cout << x.pow(2).requires_grad() << std::endl;
+
+ {
+ torch::NoGradGuard no_grad;
+ std::cout << x.pow(2).requires_grad() << std::endl;
+ }
+
+
+Out:
+
+.. code-block:: shell
+
+ true
+ true
+ false
+
+Or by using ``.detach()`` to get a new tensor with the same content but that does
+not require gradients:
+
+.. code-block:: cpp
+
+ std::cout << x.requires_grad() << std::endl;
+ y = x.detach();
+ std::cout << y.requires_grad() << std::endl;
+ std::cout << x.eq(y).all().item<bool>() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ true
+ false
+ true
+
+For more information on C++ tensor autograd APIs such as ``grad`` / ``requires_grad`` /
+``is_leaf`` / ``backward`` / ``detach`` / ``detach_`` / ``register_hook`` / ``retain_grad``,
+please see `the corresponding C++ API docs `_.
+
+Computing higher-order gradients in C++
+---------------------------------------
+
+One of the applications of higher-order gradients is calculating gradient penalty.
+Let's see an example of it using ``torch::autograd::grad``:
+
+.. code-block:: cpp
+
+ #include <torch/torch.h>
+
+ auto model = torch::nn::Linear(4, 3);
+
+ auto input = torch::randn({3, 4}).requires_grad_(true);
+ auto output = model(input);
+
+ // Calculate loss
+ auto target = torch::randn({3, 3});
+ auto loss = torch::nn::MSELoss()(output, target);
+
+ // Use norm of gradients as penalty
+ auto grad_output = torch::ones_like(output);
+ auto gradient = torch::autograd::grad({output}, {input}, /*grad_outputs=*/{grad_output}, /*create_graph=*/true)[0];
+ auto gradient_penalty = torch::pow((gradient.norm(2, /*dim=*/1) - 1), 2).mean();
+
+ // Add gradient penalty to loss
+ auto combined_loss = loss + gradient_penalty;
+ combined_loss.backward();
+
+ std::cout << input.grad() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ -0.1042 -0.0638 0.0103 0.0723
+ -0.2543 -0.1222 0.0071 0.0814
+ -0.1683 -0.1052 0.0355 0.1024
+ [ CPUFloatType{3,4} ]
+
+Please see the documentation for ``torch::autograd::backward``
+(`link `_)
+and ``torch::autograd::grad``
+(`link `_)
+for more information on how to use them.
+
+Using custom autograd function in C++
+-------------------------------------
+
+(Adapted from `this tutorial `_)
+
+Adding a new elementary operation to ``torch::autograd`` requires implementing a new ``torch::autograd::Function``
+subclass for each operation. ``torch::autograd::Function`` s are what ``torch::autograd``
+uses to compute the results and gradients, and encode the operation history. Every
+new function requires you to implement 2 methods: ``forward`` and ``backward``;
+please see `this link `_
+for the detailed requirements.
+
+Below you can find code for a ``Linear`` function from ``torch::nn``:
+
+.. code-block:: cpp
+
+ #include <torch/torch.h>
+
+ using namespace torch::autograd;
+
+ // Inherit from Function
+ class LinearFunction : public Function<LinearFunction> {
+ public:
+ // Note that both forward and backward are static functions
+
+ // bias is an optional argument
+ static torch::Tensor forward(
+ AutogradContext *ctx, torch::Tensor input, torch::Tensor weight, torch::Tensor bias = torch::Tensor()) {
+ ctx->save_for_backward({input, weight, bias});
+ auto output = input.mm(weight.t());
+ if (bias.defined()) {
+ output += bias.unsqueeze(0).expand_as(output);
+ }
+ return output;
+ }
+
+ static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+ auto saved = ctx->get_saved_variables();
+ auto input = saved[0];
+ auto weight = saved[1];
+ auto bias = saved[2];
+
+ auto grad_output = grad_outputs[0];
+ auto grad_input = grad_output.mm(weight);
+ auto grad_weight = grad_output.t().mm(input);
+ auto grad_bias = torch::Tensor();
+ if (bias.defined()) {
+ grad_bias = grad_output.sum(0);
+ }
+
+ return {grad_input, grad_weight, grad_bias};
+ }
+ };
+
+Then, we can use the ``LinearFunction`` in the following way:
+
+.. code-block:: cpp
+
+ auto x = torch::randn({2, 3}).requires_grad_();
+ auto weight = torch::randn({4, 3}).requires_grad_();
+ auto y = LinearFunction::apply(x, weight);
+ y.sum().backward();
+
+ std::cout << x.grad() << std::endl;
+ std::cout << weight.grad() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 0.5314 1.2807 1.4864
+ 0.5314 1.2807 1.4864
+ [ CPUFloatType{2,3} ]
+ 3.7608 0.9101 0.0073
+ 3.7608 0.9101 0.0073
+ 3.7608 0.9101 0.0073
+ 3.7608 0.9101 0.0073
+ [ CPUFloatType{4,3} ]
+
+Here, we give an additional example of a function that is parametrized by non-tensor arguments:
+
+.. code-block:: cpp
+
+ #include <torch/torch.h>
+
+ using namespace torch::autograd;
+
+ class MulConstant : public Function<MulConstant> {
+ public:
+ static torch::Tensor forward(AutogradContext *ctx, torch::Tensor tensor, double constant) {
+ // ctx is a context object that can be used to stash information
+ // for backward computation
+ ctx->saved_data["constant"] = constant;
+ return tensor * constant;
+ }
+
+ static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+ // We return as many input gradients as there were arguments.
+ // Gradients of non-tensor arguments to forward must be `torch::Tensor()`.
+ return {grad_outputs[0] * ctx->saved_data["constant"].toDouble(), torch::Tensor()};
+ }
+ };
+
+Then, we can use the ``MulConstant`` in the following way:
+
+.. code-block:: cpp
+
+ auto x = torch::randn({2}).requires_grad_();
+ auto y = MulConstant::apply(x, 5.5);
+ y.sum().backward();
+
+ std::cout << x.grad() << std::endl;
+
+Out:
+
+.. code-block:: shell
+
+ 5.5000
+ 5.5000
+ [ CPUFloatType{2} ]
+
+For more information on ``torch::autograd::Function``, please see
+`its documentation `_.
+
+Translating autograd code from Python to C++
+--------------------------------------------
+
+On a high level, the easiest way to use autograd in C++ is to have working
+autograd code in Python first, and then translate your autograd code from Python to
+C++ using the following table:
+
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Python | C++ |
++================================+========================================================================================================================================================================+
+| ``torch.autograd.backward`` | ``torch::autograd::backward`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.autograd.grad`` | ``torch::autograd::grad`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.detach`` | ``torch::Tensor::detach`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.detach_`` | ``torch::Tensor::detach_`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.backward`` | ``torch::Tensor::backward`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.register_hook`` | ``torch::Tensor::register_hook`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.requires_grad`` | ``torch::Tensor::requires_grad_`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.retain_grad`` | ``torch::Tensor::retain_grad`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.grad`` | ``torch::Tensor::grad`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.grad_fn`` | ``torch::Tensor::grad_fn`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.set_data`` | ``torch::Tensor::set_data`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.data`` | ``torch::Tensor::data`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.output_nr`` | ``torch::Tensor::output_nr`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ``torch.Tensor.is_leaf`` | ``torch::Tensor::is_leaf`` (`link `_) |
++--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+After translation, most of your Python autograd code should just work in C++.
+If that's not the case, please file a bug report at `GitHub issues `_
+and we will fix it as soon as possible.
+
+Conclusion
+----------
+
+You should now have a good overview of PyTorch's C++ autograd API.
+You can find the code examples displayed in this note `here
+`_. As always, if you run into any
+problems or have questions, you can use our `forum `_
+or `GitHub issues `_ to get in touch.
diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst
new file mode 100644
index 00000000000..512c39b2a68
--- /dev/null
+++ b/advanced_source/cpp_custom_ops.rst
@@ -0,0 +1,582 @@
+.. _cpp-custom-ops-tutorial:
+
+Custom C++ and CUDA Operators
+=============================
+
+**Author:** `Richard Zou `_
+
+.. grid:: 2
+
+ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+ :class-card: card-prerequisites
+
+ * How to integrate custom operators written in C++/CUDA with PyTorch
+ * How to test custom operators using ``torch.library.opcheck``
+
+ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+ :class-card: card-prerequisites
+
+ * PyTorch 2.4 or later
+ * Basic understanding of C++ and CUDA programming
+
+.. note::
+
+ This tutorial will also work on AMD ROCm with no additional modifications.
+
+PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc).
+However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the
+blessed path to authoring a custom operator written in C++/CUDA.
+
+For our tutorial, we’ll demonstrate how to author a fused multiply-add C++
+and CUDA operator that composes with PyTorch subsystems. The semantics of
+the operation are as follows:
+
+.. code-block:: python
+
+ def mymuladd(a: Tensor, b: Tensor, c: float):
+ return a * b + c
+
+You can find the end-to-end working example for this tutorial
+`here `_ .
+
+Setting up the Build System
+---------------------------
+
+If you are developing custom C++/CUDA code, it must be compiled.
+Note that if you’re interfacing with a Python library that already has bindings
+to precompiled C++/CUDA code, you might consider writing a custom Python operator
+instead (:ref:`python-custom-ops-tutorial`).
+
+Use `torch.utils.cpp_extension `_
+to compile custom C++/CUDA code for use with PyTorch.
+C++ extensions may be built either "ahead of time" with setuptools, or "just in time"
+via `load_inline `_;
+we’ll focus on the "ahead of time" flavor.
+
+Using ``cpp_extension`` is as simple as writing the following ``setup.py``:
+
+.. code-block:: python
+
+ from setuptools import setup, Extension
+ from torch.utils import cpp_extension
+
+ setup(name="extension_cpp",
+ ext_modules=[
+ cpp_extension.CppExtension(
+ "extension_cpp",
+ ["muladd.cpp"],
+ # define Py_LIMITED_API with min version 3.9 to expose only the stable
+ # limited API subset from Python.h
+ extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]},
+ py_limited_api=True)], # Build 1 wheel across multiple Python versions
+ cmdclass={'build_ext': cpp_extension.BuildExtension},
+ options={"bdist_wheel": {"py_limited_api": "cp39"}} # 3.9 is minimum supported Python version
+ )
+
+If you need to compile CUDA code (for example, ``.cu`` files), then instead use
+`torch.utils.cpp_extension.CUDAExtension `_.
+Please see `extension-cpp `_ for an
+example for how this is set up.
+
+The above example represents what we refer to as a CPython agnostic wheel, meaning we are
+building a single wheel that can be run across multiple CPython versions (similar to pure
+Python packages). CPython agnosticism is desirable for minimizing the number of wheels your
+custom library needs to support and release. The minimum version we'd like to support is
+3.9, since it is currently the oldest supported version, so we use the corresponding hexcode
+and specifier throughout the setup code. We suggest building the extension in the same
+environment as the minimum CPython version you'd like to support to minimize unknown behavior,
+so, here, we build the extension in a CPython 3.9 environment. When built, this single wheel
+will be runnable in any CPython environment 3.9+. To achieve this, there are three key lines
+to note.
+
+The first is the specification of ``Py_LIMITED_API`` in ``extra_compile_args`` to the
+minimum CPython version you would like to support:
+
+.. code-block:: python
+
+ extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]},
+
+Defining the ``Py_LIMITED_API`` flag helps verify that the extension is in fact
+only using the `CPython Stable Limited API `_,
+which is a requirement for building a CPython agnostic wheel. If this requirement
+is not met, it is possible to build a wheel that looks CPython agnostic but will crash,
+or worse, be silently incorrect, in another CPython environment. Take care to avoid
+using unstable CPython APIs, for example APIs from libtorch_python (in particular
+PyTorch/Python bindings) and to only use APIs from libtorch (ATen objects, operators
+and the dispatcher). We strongly recommend defining the ``Py_LIMITED_API`` flag to
+help ascertain the extension is compliant and safe as a CPython agnostic wheel. Note that
+defining this flag is not a full guarantee that the built wheel is CPython agnostic, but
+it is better than the wild wild west. There are several caveats mentioned in the
+`Python docs `_,
+and you should test and verify yourself that the wheel is truly agnostic for the relevant
+CPython versions.
+
+The second and third lines specifying ``py_limited_api`` inform setuptools that you intend
+to build a CPython agnostic wheel and will influence the naming of the wheel accordingly:
+
+.. code-block:: python
+
+ setup(name="extension_cpp",
+ ext_modules=[
+ cpp_extension.CppExtension(
+ ...,
+ py_limited_api=True)], # Build 1 wheel across multiple Python versions
+ ...,
+ options={"bdist_wheel": {"py_limited_api": "cp39"}} # 3.9 is minimum supported Python version
+ )
+
+It is necessary to specify ``py_limited_api=True`` as an argument to CppExtension/
+CUDAExtension and also as an option to the ``"bdist_wheel"`` command with the minimal
+supported CPython version (in this case, 3.9). Consequently, the ``setup`` in our
+tutorial would build one properly named wheel that could be installed across multiple
+CPython versions ``>=3.9``.
+
+If your extension uses CPython APIs outside the stable limited set, then you cannot
+build a CPython agnostic wheel! You should build one wheel per CPython version instead,
+like so:
+
+.. code-block:: python
+
+ from setuptools import setup, Extension
+ from torch.utils import cpp_extension
+
+ setup(name="extension_cpp",
+ ext_modules=[
+ cpp_extension.CppExtension(
+ "extension_cpp",
+ ["muladd.cpp"])],
+ cmdclass={'build_ext': cpp_extension.BuildExtension},
+ )
+
+
+Defining the custom op and adding backend implementations
+---------------------------------------------------------
+First, let's write a C++ function that computes ``mymuladd``:
+
+.. code-block:: cpp
+
+ at::Tensor mymuladd_cpu(at::Tensor a, const at::Tensor& b, double c) {
+ TORCH_CHECK(a.sizes() == b.sizes());
+ TORCH_CHECK(a.dtype() == at::kFloat);
+ TORCH_CHECK(b.dtype() == at::kFloat);
+ TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CPU);
+ TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CPU);
+ at::Tensor a_contig = a.contiguous();
+ at::Tensor b_contig = b.contiguous();
+ at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
+ const float* a_ptr = a_contig.data_ptr<float>();
+ const float* b_ptr = b_contig.data_ptr<float>();
+ float* result_ptr = result.data_ptr<float>();
+ for (int64_t i = 0; i < result.numel(); i++) {
+ result_ptr[i] = a_ptr[i] * b_ptr[i] + c;
+ }
+ return result;
+ }
+
+In order to use this from PyTorch’s Python frontend, we need to register it
+as a PyTorch operator using the ``TORCH_LIBRARY`` API. This will automatically
+bind the operator to Python.
+
+Operator registration is a two-step process:
+
+- **Defining the operator** - This step ensures that PyTorch is aware of the new operator.
+- **Registering backend implementations** - In this step, implementations for various
+ backends, such as CPU and CUDA, are associated with the operator.
+
+Defining an operator
+^^^^^^^^^^^^^^^^^^^^
+To define an operator, follow these steps:
+
+1. select a namespace for an operator. We recommend the namespace be the name of your top-level
+ project; we’ll use "extension_cpp" in our tutorial.
+2. provide a schema string that specifies the input/output types of the operator and whether
+ any input Tensors will be mutated. We support more types in addition to Tensor and float;
+ please see `The Custom Operators Manual `_
+ for more details.
+
+ * If you are authoring an operator that can mutate its input Tensors, please see here
+ (:ref:`mutable-ops`) for how to specify that.
+
+.. code-block:: cpp
+
+ TORCH_LIBRARY(extension_cpp, m) {
+ // Note that "float" in the schema corresponds to the C++ double type
+ // and the Python float type.
+ m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
+ }
+
+This makes the operator available from Python via ``torch.ops.extension_cpp.mymuladd``.
+
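+Once a backend implementation has been registered (see the next section) and the
+extension library has been loaded, the operator can be called like any other
+PyTorch operation. A minimal sketch, assuming the ``extension_cpp`` package from
+this tutorial is installed:
+
+.. code-block:: python
+
+   import torch
+   import extension_cpp  # loads the C++ library and its registrations
+
+   a, b = torch.randn(3), torch.randn(3)
+   out = torch.ops.extension_cpp.mymuladd(a, b, 1.0)
+   assert torch.allclose(out, a * b + 1.0)
+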
+Registering backend implementations for an operator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Use ``TORCH_LIBRARY_IMPL`` to register a backend implementation for the operator.
+
+.. code-block:: cpp
+
+ TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) {
+ m.impl("mymuladd", &mymuladd_cpu);
+ }
+
+If you also have a CUDA implementation of ``mymuladd``, you can register it
+in a separate ``TORCH_LIBRARY_IMPL`` block:
+
+.. code-block:: cpp
+
+ __global__ void muladd_kernel(int numel, const float* a, const float* b, float c, float* result) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < numel) result[idx] = a[idx] * b[idx] + c;
+ }
+
+ at::Tensor mymuladd_cuda(const at::Tensor& a, const at::Tensor& b, double c) {
+ TORCH_CHECK(a.sizes() == b.sizes());
+ TORCH_CHECK(a.dtype() == at::kFloat);
+ TORCH_CHECK(b.dtype() == at::kFloat);
+ TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
+ TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
+ at::Tensor a_contig = a.contiguous();
+ at::Tensor b_contig = b.contiguous();
+ at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
+ const float* a_ptr = a_contig.data_ptr<float>();
+ const float* b_ptr = b_contig.data_ptr<float>();
+ float* result_ptr = result.data_ptr<float>();
+
+ int numel = a_contig.numel();
+ muladd_kernel<<<(numel+255)/256, 256>>>(numel, a_ptr, b_ptr, c, result_ptr);
+ return result;
+ }
+
+ TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
+ m.impl("mymuladd", &mymuladd_cuda);
+ }
+
+Adding ``torch.compile`` support for an operator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To add ``torch.compile`` support for an operator, we must add a FakeTensor kernel (also
+known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have
+metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an
+operator specifies how to compute the metadata of output tensors given the metadata of input tensors.
+The FakeTensor kernel should return dummy Tensors of your choice with
+the correct Tensor metadata (shape/strides/``dtype``/device).
+
+We recommend that this be done from Python via the ``torch.library.register_fake`` API,
+though it is possible to do this from C++ as well (see
+`The Custom Operators Manual `_
+for more details).
+
+.. code-block:: python
+
+ # Important: the C++ custom operator definitions should be loaded first
+ # before calling ``torch.library`` APIs that add registrations for the
+ # C++ custom operator(s). The following import loads our
+ # C++ custom operator definitions.
+ # Note that if you are striving for Python agnosticism, you should use
+ # the ``load_library(...)`` API call instead. See the next section for
+ # more details.
+ from . import _C
+
+ @torch.library.register_fake("extension_cpp::mymuladd")
+ def _(a, b, c):
+ torch._check(a.shape == b.shape)
+ torch._check(a.dtype == torch.float)
+ torch._check(b.dtype == torch.float)
+ torch._check(a.device == b.device)
+ return torch.empty_like(a)
+
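+With the FakeTensor kernel registered, ``torch.compile`` can trace through the
+operator. The following smoke test is a minimal sketch, assuming the extension
+has been built and its registrations loaded:
+
+.. code-block:: python
+
+   import torch
+   import extension_cpp
+
+   @torch.compile(fullgraph=True)
+   def f(a, b):
+       return torch.ops.extension_cpp.mymuladd.default(a, b, 1.0)
+
+   a, b = torch.randn(3), torch.randn(3)
+   assert torch.allclose(f(a, b), a * b + 1.0)
+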
+Setting up hybrid Python/C++ registration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+In this tutorial, we defined a custom operator in C++, added CPU/CUDA
+implementations in C++, and added ``FakeTensor`` kernels and backward formulas
+in Python. The order in which these registrations are loaded (or imported)
+matters (importing in the wrong order will lead to an error).
+
+To use the custom operator with hybrid Python/C++ registrations, we must
+first load the C++ library that holds the custom operator definition
+and then call the ``torch.library`` registration APIs. This can happen in
+three ways:
+
+
+1. The first way to load the C++ library that holds the custom operator definition
+ is to define a dummy Python module for _C. Then, in Python, when you import the
+ module with ``import _C``, the ``.so`` files corresponding to the extension will
+ be loaded and the ``TORCH_LIBRARY`` and ``TORCH_LIBRARY_IMPL`` static initializers
+ will run. One can create a dummy Python module with ``PYBIND11_MODULE`` like below,
+ but you will notice that this does not compile with ``Py_LIMITED_API``, because
+ ``pybind11`` does not promise to only use the stable limited CPython API! With
+ the below code, you sadly cannot build a CPython agnostic wheel for your extension!
+ (Foreshadowing: I wonder what the second way is ;) ).
+
+.. code-block:: cpp
+
+ // in, say, not_agnostic/csrc/extension_BAD.cpp
+ #include <pybind11/pybind11.h>
+
+ PYBIND11_MODULE(_C, m) {}
+
+.. code-block:: python
+
+ # in, say, extension/__init__.py
+ from . import _C
+
+2. In this tutorial, because we value being able to build a single wheel across multiple
+ CPython versions, we will replace the unstable ``PYBIND11`` call with stable API calls.
+ The below code compiles with ``-DPy_LIMITED_API=0x03090000`` and successfully creates
+ a dummy Python module for our ``_C`` extension so that it can be imported from Python.
+ See `extension_cpp/__init__.py `_
+ and `extension_cpp/csrc/muladd.cpp `_
+ for more details:
+
+.. code-block:: cpp
+
+ #include <Python.h>
+
+ extern "C" {
+ /* Creates a dummy empty _C module that can be imported from Python.
+ The import from Python will load the .so consisting of this file
+ in this extension, so that the TORCH_LIBRARY static initializers
+ below are run. */
+ PyObject* PyInit__C(void)
+ {
+ static struct PyModuleDef module_def = {
+ PyModuleDef_HEAD_INIT,
+ "_C", /* name of module */
+ NULL, /* module documentation, may be NULL */
+ -1, /* size of per-interpreter state of the module,
+ or -1 if the module keeps state in global variables. */
+ NULL, /* methods */
+ };
+ return PyModule_Create(&module_def);
+ }
+ }
+
+.. code-block:: python
+
+ # in, say, extension/__init__.py
+ from . import _C
+
+3. If you want to avoid ``Python.h`` entirely in your C++ custom operator, you may
+ use ``torch.ops.load_library("/path/to/library.so")`` in Python to load the ``.so``
+ file(s) compiled from the extension. Note that, with this method, there is no ``_C``
+ Python module created for the extension so you cannot call ``import _C`` from Python.
+ Instead of relying on the import statement to trigger the custom operators to be
+ registered, ``torch.ops.load_library("/path/to/library.so")`` will do the trick.
+ The challenge then is shifted towards understanding where the ``.so`` files are
+ located so that you can load them, which is not always trivial:
+
+.. code-block:: python
+
+ import torch
+ from pathlib import Path
+
+ so_files = list(Path(__file__).parent.glob("_C*.so"))
+ assert (
+ len(so_files) == 1
+ ), f"Expected one _C*.so file, found {len(so_files)}"
+ torch.ops.load_library(so_files[0])
+
+ from . import ops
+
+
+Adding training (autograd) support for an operator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Use ``torch.library.register_autograd`` to add training support for an operator. Prefer
+this over directly using Python ``torch.autograd.Function`` or C++ ``torch::autograd::Function``;
+you must use those in a very specific way to avoid silent incorrectness (see
+`The Custom Operators Manual `_
+for more details).
+
+.. code-block:: python
+
+ def _backward(ctx, grad):
+ a, b = ctx.saved_tensors
+ grad_a, grad_b = None, None
+ if ctx.needs_input_grad[0]:
+ grad_a = grad * b
+ if ctx.needs_input_grad[1]:
+ grad_b = grad * a
+ return grad_a, grad_b, None
+
+ def _setup_context(ctx, inputs, output):
+ a, b, c = inputs
+ saved_a, saved_b = None, None
+ if ctx.needs_input_grad[0]:
+ saved_b = b
+ if ctx.needs_input_grad[1]:
+ saved_a = a
+ ctx.save_for_backward(saved_a, saved_b)
+
+ # This code adds training support for the operator. You must provide us
+ # the backward formula for the operator and a `setup_context` function
+ # to save values to be used in the backward.
+ torch.library.register_autograd(
+ "extension_cpp::mymuladd", _backward, setup_context=_setup_context)
+
+Note that the backward must be a composition of PyTorch-understood operators.
+If you wish to use another custom C++ or CUDA kernel in your backwards pass,
+it must be wrapped into a custom operator.
+
+If we had our own custom ``mymul`` kernel, we would need to wrap it into a
+custom operator and then call that from the backward:
+
+.. code-block:: cpp
+
+ // New! a mymul_cpu kernel
+ at::Tensor mymul_cpu(const at::Tensor& a, const at::Tensor& b) {
+ TORCH_CHECK(a.sizes() == b.sizes());
+ TORCH_CHECK(a.dtype() == at::kFloat);
+ TORCH_CHECK(b.dtype() == at::kFloat);
+ TORCH_CHECK(a.device().type() == at::DeviceType::CPU);
+ TORCH_CHECK(b.device().type() == at::DeviceType::CPU);
+ at::Tensor a_contig = a.contiguous();
+ at::Tensor b_contig = b.contiguous();
+ at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options());
+ const float* a_ptr = a_contig.data_ptr<float>();
+ const float* b_ptr = b_contig.data_ptr<float>();
+ float* result_ptr = result.data_ptr<float>();
+ for (int64_t i = 0; i < result.numel(); i++) {
+ result_ptr[i] = a_ptr[i] * b_ptr[i];
+ }
+ return result;
+ }
+
+ TORCH_LIBRARY(extension_cpp, m) {
+ m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
+ // New! defining the mymul operator
+ m.def("mymul(Tensor a, Tensor b) -> Tensor");
+ }
+
+
+ TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) {
+ m.impl("mymuladd", &mymuladd_cpu);
+ // New! registering the cpu kernel for the mymul operator
+ m.impl("mymul", &mymul_cpu);
+ }
+
+.. code-block:: python
+
+ def _backward(ctx, grad):
+ a, b = ctx.saved_tensors
+ grad_a, grad_b = None, None
+ if ctx.needs_input_grad[0]:
+ grad_a = torch.ops.extension_cpp.mymul.default(grad, b)
+ if ctx.needs_input_grad[1]:
+ grad_b = torch.ops.extension_cpp.mymul.default(grad, a)
+ return grad_a, grad_b, None
+
+
+ def _setup_context(ctx, inputs, output):
+ a, b, c = inputs
+ saved_a, saved_b = None, None
+ if ctx.needs_input_grad[0]:
+ saved_b = b
+ if ctx.needs_input_grad[1]:
+ saved_a = a
+ ctx.save_for_backward(saved_a, saved_b)
+
+
+ # This code adds training support for the operator. You must provide us
+ # the backward formula for the operator and a `setup_context` function
+ # to save values to be used in the backward.
+ torch.library.register_autograd(
+ "extension_cpp::mymuladd", _backward, setup_context=_setup_context)
+
+Testing an operator
+-------------------
+Use ``torch.library.opcheck`` to test that the custom op was registered correctly.
+Note that this function does not test that the gradients are mathematically correct
+-- plan to write separate tests for that, either manual ones or by using
+``torch.autograd.gradcheck``.
+
+.. code-block:: python
+
+ def sample_inputs(device, *, requires_grad=False):
+ def make_tensor(*size):
+ return torch.randn(size, device=device, requires_grad=requires_grad)
+
+ def make_nondiff_tensor(*size):
+ return torch.randn(size, device=device, requires_grad=False)
+
+ return [
+ [make_tensor(3), make_tensor(3), 1],
+ [make_tensor(20), make_tensor(20), 3.14],
+ [make_tensor(20), make_nondiff_tensor(20), -123],
+ [make_nondiff_tensor(2, 3), make_tensor(2, 3), -0.3],
+ ]
+
+ def reference_muladd(a, b, c):
+ return a * b + c
+
+ samples = sample_inputs(device, requires_grad=True)
+ samples.extend(sample_inputs(device, requires_grad=False))
+ for args in samples:
+ # Correctness test
+ result = torch.ops.extension_cpp.mymuladd(*args)
+ expected = reference_muladd(*args)
+ torch.testing.assert_close(result, expected)
+
+ # Use opcheck to check for incorrect usage of operator registration APIs
+ torch.library.opcheck(torch.ops.extension_cpp.mymuladd.default, args)
+
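+For the gradient values themselves, a separate check with ``torch.autograd.gradcheck``
+can be used. The following is a minimal sketch; since the operator only accepts
+``float32`` (``gradcheck`` is most reliable in double precision), the tolerances
+are relaxed accordingly:
+
+.. code-block:: python
+
+   import torch
+
+   def f(a, b):
+       return torch.ops.extension_cpp.mymuladd.default(a, b, 1.0)
+
+   a = torch.randn(8, requires_grad=True)
+   b = torch.randn(8, requires_grad=True)
+   torch.autograd.gradcheck(f, (a, b), eps=1e-2, atol=1e-2, rtol=1e-2)
+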
+.. _mutable-ops:
+
+Creating mutable operators
+--------------------------
+You may wish to author a custom operator that mutates its inputs. Use ``Tensor(a!)``
+to specify each mutable Tensor in the schema; otherwise, there will be undefined
+behavior. If there are multiple mutated Tensors, use different names (for example, ``Tensor(a!)``,
+``Tensor(b!)``, ``Tensor(c!)``) for each mutable Tensor.
+
+Let's author a ``myadd_out(a, b, out)`` operator, which writes the contents of ``a+b`` into ``out``.
+
+.. code-block:: cpp
+
+ // An example of an operator that mutates one of its inputs.
+ void myadd_out_cpu(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) {
+ TORCH_CHECK(a.sizes() == b.sizes());
+ TORCH_CHECK(b.sizes() == out.sizes());
+ TORCH_CHECK(a.dtype() == at::kFloat);
+ TORCH_CHECK(b.dtype() == at::kFloat);
+ TORCH_CHECK(out.dtype() == at::kFloat);
+ TORCH_CHECK(out.is_contiguous());
+ TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CPU);
+ TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CPU);
+ TORCH_INTERNAL_ASSERT(out.device().type() == at::DeviceType::CPU);
+ at::Tensor a_contig = a.contiguous();
+ at::Tensor b_contig = b.contiguous();
+ const float* a_ptr = a_contig.data_ptr<float>();
+ const float* b_ptr = b_contig.data_ptr<float>();
+ float* result_ptr = out.data_ptr<float>();
+ for (int64_t i = 0; i < out.numel(); i++) {
+ result_ptr[i] = a_ptr[i] + b_ptr[i];
+ }
+ }
+
+When defining the operator, we must specify that it mutates the out Tensor in the schema:
+
+.. code-block:: cpp
+
+ TORCH_LIBRARY(extension_cpp, m) {
+ m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
+ m.def("mymul(Tensor a, Tensor b) -> Tensor");
+ // New!
+ m.def("myadd_out(Tensor a, Tensor b, Tensor(a!) out) -> ()");
+ }
+
+ TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) {
+ m.impl("mymuladd", &mymuladd_cpu);
+ m.impl("mymul", &mymul_cpu);
+ // New!
+ m.impl("myadd_out", &myadd_out_cpu);
+ }
+
+.. note::
+
+ Do not return any mutated Tensors as outputs of the operator as this will
+ cause incompatibility with PyTorch subsystems like ``torch.compile``.
+
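+Once registered, the mutable operator can be called with a preallocated output
+tensor. A minimal sketch, assuming the extension is loaded:
+
+.. code-block:: python
+
+   import torch
+   import extension_cpp
+
+   a, b = torch.randn(3), torch.randn(3)
+   out = torch.empty(3)
+   torch.ops.extension_cpp.myadd_out(a, b, out)
+   assert torch.allclose(out, a + b)
+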
+Conclusion
+----------
+In this tutorial, we went over the recommended approach to integrating Custom C++
+and CUDA operators with PyTorch. The ``TORCH_LIBRARY/torch.library`` APIs are fairly
+low-level. For more information about how to use the API, see
+`The Custom Operators Manual `_.
diff --git a/advanced_source/cpp_custom_ops_sycl.rst b/advanced_source/cpp_custom_ops_sycl.rst
new file mode 100644
index 00000000000..3b3ad069b58
--- /dev/null
+++ b/advanced_source/cpp_custom_ops_sycl.rst
@@ -0,0 +1,274 @@
+.. _cpp-custom-ops-tutorial-sycl:
+
+Custom SYCL Operators
+=====================
+
+.. grid:: 2
+
+ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+ :class-card: card-prerequisites
+
+ * How to integrate custom operators written in SYCL with PyTorch
+
+ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+ :class-card: card-prerequisites
+
+ * PyTorch 2.8 or later
+ * Basic understanding of SYCL programming
+
+.. note::
+
+ ``SYCL`` serves as the backend programming language for Intel GPUs (device label ``xpu``). For configuration details, see:
+ `Getting Started on Intel GPUs `_. The Intel Compiler, which comes bundled with Intel Deep Learning Essentials, handles ``SYCL`` compilation. Ensure you install and activate the compiler environment prior to executing the code examples in this tutorial.
+
+PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc).
+However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the
+best path to authoring a custom operator written in SYCL. Tutorials for C++ and CUDA operators are available in the :ref:`cpp-custom-ops-tutorial`.
+
+Follow the structure to create a custom SYCL operator:
+
+.. code-block:: text
+
+ sycl_example/
+ ├── setup.py
+ ├── sycl_extension
+ │ ├── __init__.py
+ │ ├── muladd.sycl
+ │ └── ops.py
+ └── test_sycl_extension.py
+
+Setting up the Build System
+---------------------------
+
+If you need to compile **SYCL** code (for example, ``.sycl`` files), use `torch.utils.cpp_extension.SyclExtension `_.
+The setup process is very similar to C++/CUDA, except the compilation arguments need to be adjusted for SYCL.
+
+Using ``sycl_extension`` is as straightforward as writing the following ``setup.py``:
+
+.. code-block:: python
+
+ import os
+ import torch
+ import glob
+ from setuptools import find_packages, setup
+ from torch.utils.cpp_extension import SyclExtension, BuildExtension
+
+ library_name = "sycl_extension"
+ py_limited_api = True
+ extra_compile_args = {
+ "cxx": ["-O3",
+ "-fdiagnostics-color=always",
+ "-DPy_LIMITED_API=0x03090000"],
+ "sycl": ["-O3" ]
+ }
+
+ assert(torch.xpu.is_available()), "XPU is not available, please check your environment"
+ # Source files collection
+ this_dir = os.path.dirname(os.path.curdir)
+ extensions_dir = os.path.join(this_dir, library_name)
+ sources = list(glob.glob(os.path.join(extensions_dir, "*.sycl")))
+ # Construct extension
+ ext_modules = [
+ SyclExtension(
+ f"{library_name}._C",
+ sources,
+ extra_compile_args=extra_compile_args,
+ py_limited_api=py_limited_api,
+ )
+ ]
+ setup(
+ name=library_name,
+ packages=find_packages(),
+ ext_modules=ext_modules,
+ install_requires=["torch"],
+ description="Simple Example of PyTorch Sycl extensions",
+ cmdclass={"build_ext": BuildExtension},
+ options={"bdist_wheel": {"py_limited_api": "cp39"}} if py_limited_api else {},
+ )
+
+
+Defining the custom op and adding backend implementations
+---------------------------------------------------------
+First, let's write a SYCL kernel that computes ``mymuladd``.
+
+In order to use this operator from PyTorch’s Python frontend, we need to register it
+as a PyTorch operator using the ``TORCH_LIBRARY`` API. This will automatically
+bind the operator to Python.
+
+The kernel, the operator definition, and its registration for the ``XPU`` backend
+(via a ``TORCH_LIBRARY_IMPL`` block) all live in ``sycl_extension/muladd.sycl``:
+
+.. code-block:: cpp
+
+ #include <c10/xpu/XPUStream.h>
+ #include <sycl/sycl.hpp>
+ #include <ATen/Operators.h>
+ #include <torch/all.h>
+ #include <torch/library.h>
+
+ namespace sycl_extension {
+ // MulAdd Kernel: result = a * b + c
+ static void muladd_kernel(
+ int numel, const float* a, const float* b, float c, float* result,
+ const sycl::nd_item<1>& item) {
+ int idx = item.get_global_id(0);
+ if (idx < numel) {
+ result[idx] = a[idx] * b[idx] + c;
+ }
+ }
+
+ class MulAddKernelFunctor {
+ public:
+ MulAddKernelFunctor(int _numel, const float* _a, const float* _b, float _c, float* _result)
+ : numel(_numel), a(_a), b(_b), c(_c), result(_result) {}
+ void operator()(const sycl::nd_item<1>& item) const {
+ muladd_kernel(numel, a, b, c, result, item);
+ }
+
+ private:
+ int numel;
+ const float* a;
+ const float* b;
+ float c;
+ float* result;
+ };
+
+ at::Tensor mymuladd_xpu(const at::Tensor& a, const at::Tensor& b, double c) {
+ TORCH_CHECK(a.sizes() == b.sizes(), "a and b must have the same shape");
+ TORCH_CHECK(a.dtype() == at::kFloat, "a must be a float tensor");
+ TORCH_CHECK(b.dtype() == at::kFloat, "b must be a float tensor");
+ TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor");
+ TORCH_CHECK(b.device().is_xpu(), "b must be an XPU tensor");
+
+ at::Tensor a_contig = a.contiguous();
+ at::Tensor b_contig = b.contiguous();
+ at::Tensor result = at::empty_like(a_contig);
+
+ const float* a_ptr = a_contig.data_ptr<float>();
+ const float* b_ptr = b_contig.data_ptr<float>();
+ float* res_ptr = result.data_ptr<float>();
+ int numel = a_contig.numel();
+
+ sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue();
+ constexpr int threads = 256;
+ int blocks = (numel + threads - 1) / threads;
+
+ queue.submit([&](sycl::handler& cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<1>(blocks * threads, threads),
+ MulAddKernelFunctor(numel, a_ptr, b_ptr, static_cast<float>(c), res_ptr)
+ );
+ });
+
+ return result;
+ }
+ // Defines the operators
+ TORCH_LIBRARY(sycl_extension, m) {
+ m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor");
+ }
+
+ // ==================================================
+ // Register SYCL Implementations to Torch Library
+ // ==================================================
+ TORCH_LIBRARY_IMPL(sycl_extension, XPU, m) {
+ m.impl("mymuladd", &mymuladd_xpu);
+ }
+
+ } // namespace sycl_extension
+
+
+
+Create a Python Interface
+-------------------------
+
+Create a Python interface for our operator in the ``sycl_extension/ops.py`` file:
+
+.. code-block:: python
+
+ import torch
+ from torch import Tensor
+ __all__ = ["mymuladd"]
+
+ def mymuladd(a: Tensor, b: Tensor, c: float) -> Tensor:
+ """Performs a * b + c in an efficient fused kernel"""
+ return torch.ops.sycl_extension.mymuladd.default(a, b, c)
+
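+With the extension built and an XPU device available, calling the operator
+from Python is a one-liner. A quick smoke test might look like this:
+
+.. code-block:: python
+
+ import torch
+ import sycl_extension
+
+ a = torch.randn(3, device="xpu")
+ b = torch.randn(3, device="xpu")
+ print(sycl_extension.ops.mymuladd(a, b, 1.0))  # same as a * b + 1.0
+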
+Initialize Package
+------------------
+
+Create ``sycl_extension/__init__.py`` file to make the package importable:
+
+.. code-block:: python
+
+ import ctypes
+ from pathlib import Path
+
+ import torch
+
+ current_dir = Path(__file__).parent.parent
+ build_dir = current_dir / "build"
+ so_files = list(build_dir.glob("**/*.so"))
+
+ assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
+
+ with torch._ops.dl_open_guard():
+ loaded_lib = ctypes.CDLL(so_files[0])
+
+ from . import ops
+
+ __all__ = [
+ "loaded_lib",
+ "ops",
+ ]
+
+Testing SYCL extension operator
+-------------------------------
+
+Use a simple test to verify that the operator works correctly.
+
+.. code-block:: python
+
+ import torch
+ from torch.testing._internal.common_utils import TestCase
+ import unittest
+ import sycl_extension
+
+ def reference_muladd(a, b, c):
+ return a * b + c
+
+ class TestMyMulAdd(TestCase):
+ def sample_inputs(self, device, *, requires_grad=False):
+ def make_tensor(*size):
+ return torch.randn(size, device=device, requires_grad=requires_grad)
+
+ def make_nondiff_tensor(*size):
+ return torch.randn(size, device=device, requires_grad=False)
+
+ return [
+ [make_tensor(3), make_tensor(3), 1],
+ [make_tensor(20), make_tensor(20), 3.14],
+ [make_tensor(20), make_nondiff_tensor(20), -123],
+ [make_nondiff_tensor(2, 3), make_tensor(2, 3), -0.3],
+ ]
+
+ def _test_correctness(self, device):
+ samples = self.sample_inputs(device)
+ for args in samples:
+ result = sycl_extension.ops.mymuladd(*args)
+ expected = reference_muladd(*args)
+ torch.testing.assert_close(result, expected)
+
+ @unittest.skipIf(not torch.xpu.is_available(), "requires Intel GPU")
+ def test_correctness_xpu(self):
+ self._test_correctness("xpu")
+
+ if __name__ == "__main__":
+ unittest.main()
+
+This test checks the correctness of the custom operator by comparing its output against a reference implementation.
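+
+To run the test (assuming the extension has been built and installed, and an
+Intel GPU is visible to PyTorch):
+
+.. code-block:: shell
+
+ python test_sycl_extension.py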
+
+Conclusion
+----------
+
+In this tutorial, we demonstrated how to implement and compile custom SYCL operators for PyTorch. We specifically showcased an inference operation ``muladd``. For adding backward support or enabling ``torch.compile`` compatibility, please refer to :ref:`cpp-custom-ops-tutorial`.
diff --git a/advanced_source/cpp_export.rst b/advanced_source/cpp_export.rst
new file mode 100644
index 00000000000..56c4bcbaae7
--- /dev/null
+++ b/advanced_source/cpp_export.rst
@@ -0,0 +1,3 @@
+.. warning::
+ TorchScript is deprecated, please use
+ `torch.export <https://pytorch.org/docs/stable/export.html>`__ instead.
\ No newline at end of file
diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst
new file mode 100644
index 00000000000..968afa01b23
--- /dev/null
+++ b/advanced_source/cpp_frontend.rst
@@ -0,0 +1,1325 @@
+.. _cpp-frontend-tutorial:
+
+Using the PyTorch C++ Frontend
+==============================
+
+**Author:** `Peter Goldsborough <https://www.goldsborough.me/>`_
+
+.. grid:: 2
+
+ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+ :class-card: card-prerequisites
+
+ * How to build a C++ application that utilizes the PyTorch C++ frontend
+ * How to define and train neural networks from C++ using PyTorch abstractions
+
+ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+ :class-card: card-prerequisites
+
+ * PyTorch 1.5 or later
+ * Basic understanding of C++ programming
+ * Basic Ubuntu Linux environment with CMake >= 3.5; similar commands will work in a MacOS / Windows environment
+ * (Optional) A CUDA-based GPU for the GPU training sections
+
+The PyTorch C++ frontend is a pure C++ interface to the PyTorch machine learning
+framework. While the primary interface to PyTorch naturally is Python, this
+Python API sits atop a substantial C++ codebase providing foundational data
+structures and functionality such as tensors and automatic differentiation. The
+C++ frontend exposes a pure C++17 API that extends this underlying C++ codebase
+with tools required for machine learning training and inference. This includes a
+built-in collection of common components for neural network modeling; an API to
+extend this collection with custom modules; a library of popular optimization
+algorithms such as stochastic gradient descent; a parallel data loader with an
+API to define and load datasets; serialization routines and more.
+
+This tutorial will walk you through an end-to-end example of training a model
+with the C++ frontend. Concretely, we will be training a `DCGAN
+<https://arxiv.org/abs/1511.06434>`_ -- a kind of generative model -- to
+generate images of MNIST digits. While conceptually a simple example, it should
+be enough to give you a whirlwind overview of the PyTorch C++ frontend and whet
+your appetite for training more complex models. We will begin with some
+motivating words for why you would want to use the C++ frontend to begin with,
+and then dive straight into defining and training our model.
+
+.. tip::
+
+ Watch `this lightning talk from CppCon 2018
+ `_ for a quick (and humorous)
+ presentation on the C++ frontend.
+
+.. tip::
+
+ `This note <https://pytorch.org/cppdocs/frontend.html>`_ provides a sweeping
+ overview of the C++ frontend's components and design philosophy.
+
+.. tip::
+
+ Documentation for the PyTorch C++ ecosystem is available at
+ https://pytorch.org/cppdocs. There you can find high level descriptions as
+ well as API-level documentation.
+
+Motivation
+----------
+
+Before we embark on our exciting journey of GANs and MNIST digits, let's take a
+step back and discuss why you would want to use the C++ frontend instead of the
+Python one to begin with. We (the PyTorch team) created the C++ frontend to
+enable research in environments in which Python cannot be used, or is simply not
+the right tool for the job. Examples for such environments include:
+
+- **Low Latency Systems**: You may want to do reinforcement learning research in
+ a pure C++ game engine with high frames-per-second and low latency
+ requirements. Using a pure C++ library is a much better fit to such an
+ environment than a Python library. Python may not be tractable at all because
+ of the slowness of the Python interpreter.
+- **Highly Multithreaded Environments**: Due to the Global Interpreter Lock
+ (GIL), Python cannot run more than one system thread at a time.
+ Multiprocessing is an alternative, but not as scalable and has significant
+ shortcomings. C++ has no such constraints and threads are easy to use and
+ create. Models requiring heavy parallelization, like those used in `Deep
+ Neuroevolution `_, can benefit from
+ this.
+- **Existing C++ Codebases**: You may be the owner of an existing C++
+ application doing anything from serving web pages in a backend server to
+ rendering 3D graphics in photo editing software, and wish to integrate
+ machine learning methods into your system. The C++ frontend allows you to
+ remain in C++ and spare yourself the hassle of binding back and forth between
+ Python and C++, while retaining much of the flexibility and intuitiveness of
+ the traditional PyTorch (Python) experience.
+
+The C++ frontend is not intended to compete with the Python frontend. It is
+meant to complement it. We know researchers and engineers alike love PyTorch for
+its simplicity, flexibility and intuitive API. Our goal is to make sure you can
+take advantage of these core design principles in every possible environment,
+including the ones described above. If one of these scenarios describes your use
+case well, or if you are simply interested or curious, follow along as we
+explore the C++ frontend in detail in the following paragraphs.
+
+.. tip::
+
+ The C++ frontend tries to provide an API as close as possible to that of the
+ Python frontend. If you are experienced with the Python frontend and ever ask
+ yourself "how do I do X with the C++ frontend?", write your code the way you
+ would in Python, and more often than not the same functions and methods will
+ be available in C++ as in Python (just remember to replace dots with double
+ colons).
+
+Writing a Basic Application
+---------------------------
+
+Let's begin by writing a minimal C++ application to verify that we're on the
+same page regarding our setup and build environment. First, you will need to
+grab a copy of the *LibTorch* distribution -- our ready-built zip archive that
+packages all relevant headers, libraries and CMake build files required to use
+the C++ frontend. The LibTorch distribution is available for download on the
+`PyTorch website <https://pytorch.org>`_ for Linux, MacOS
+and Windows. The rest of this tutorial will assume a basic Ubuntu Linux
+environment; however, you are free to follow along on MacOS or Windows too.
+
+.. tip::
+
+ The note on `Installing C++ Distributions of PyTorch
+ <https://pytorch.org/cppdocs/installing.html>`_ describes the following steps
+ in more detail.
+
+.. tip::
+ On Windows, debug and release builds are not ABI-compatible. If you plan to
+ build your project in debug mode, please try the debug version of LibTorch.
+ Also, make sure you specify the correct configuration in the ``cmake --build .``
+ line below.
+
+The first step is to download the LibTorch distribution locally, via the link
+retrieved from the PyTorch website. For a vanilla Ubuntu Linux environment, this
+means running:
+
+.. code-block:: shell
+
+ # If you need e.g. CUDA 9.0 support, please replace "cpu" with "cu90" in the URL below.
+ wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip
+ unzip libtorch-shared-with-deps-latest.zip
+
+Next, let's write a tiny C++ file called ``dcgan.cpp`` that includes
+``torch/torch.h`` and for now simply prints out a three by three identity
+matrix:
+
+.. code-block:: cpp
+
+ #include <torch/torch.h>
+ #include <iostream>
+
+ int main() {
+ torch::Tensor tensor = torch::eye(3);
+ std::cout << tensor << std::endl;
+ }
+
+To build this tiny application as well as our full-fledged training script later
+on we'll use this ``CMakeLists.txt`` file:
+
+.. code-block:: cmake
+
+ cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+ project(dcgan)
+
+ find_package(Torch REQUIRED)
+
+ add_executable(dcgan dcgan.cpp)
+ target_link_libraries(dcgan "${TORCH_LIBRARIES}")
+ set_property(TARGET dcgan PROPERTY CXX_STANDARD 17)
+
+.. note::
+
+ While CMake is the recommended build system for LibTorch, it is not a hard
+ requirement. You can also use Visual Studio project files, QMake, plain
+ Makefiles or any other build environment you feel comfortable with. However,
+ we do not provide out-of-the-box support for this.
+
+Make note of line 4 in the above CMake file: ``find_package(Torch REQUIRED)``.
+This instructs CMake to find the build configuration for the LibTorch library.
+In order for CMake to know *where* to find these files, we must set the
+``CMAKE_PREFIX_PATH`` when invoking ``cmake``. Before we do this, let's agree on
+the following directory structure for our ``dcgan`` application:
+
+.. code-block:: shell
+
+ dcgan/
+ CMakeLists.txt
+ dcgan.cpp
+
+Further, I will refer to the path to the unzipped LibTorch distribution as
+``/path/to/libtorch``. Note that this **must be an absolute path**. In
+particular, setting ``CMAKE_PREFIX_PATH`` to something like ``../../libtorch``
+will break in unexpected ways. Instead, write ``$PWD/../../libtorch`` to get the
+corresponding absolute path. Now, we are ready to build our application:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home# mkdir build
+ root@fa350df05ecf:/home# cd build
+ root@fa350df05ecf:/home/build# cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
+ -- The C compiler identification is GNU 5.4.0
+ -- The CXX compiler identification is GNU 5.4.0
+ -- Check for working C compiler: /usr/bin/cc
+ -- Check for working C compiler: /usr/bin/cc -- works
+ -- Detecting C compiler ABI info
+ -- Detecting C compiler ABI info - done
+ -- Detecting C compile features
+ -- Detecting C compile features - done
+ -- Check for working CXX compiler: /usr/bin/c++
+ -- Check for working CXX compiler: /usr/bin/c++ -- works
+ -- Detecting CXX compiler ABI info
+ -- Detecting CXX compiler ABI info - done
+ -- Detecting CXX compile features
+ -- Detecting CXX compile features - done
+ -- Looking for pthread.h
+ -- Looking for pthread.h - found
+ -- Looking for pthread_create
+ -- Looking for pthread_create - not found
+ -- Looking for pthread_create in pthreads
+ -- Looking for pthread_create in pthreads - not found
+ -- Looking for pthread_create in pthread
+ -- Looking for pthread_create in pthread - found
+ -- Found Threads: TRUE
+ -- Found torch: /path/to/libtorch/lib/libtorch.so
+ -- Configuring done
+ -- Generating done
+ -- Build files have been written to: /home/build
+ root@fa350df05ecf:/home/build# cmake --build . --config Release
+ Scanning dependencies of target dcgan
+ [ 50%] Building CXX object CMakeFiles/dcgan.dir/dcgan.cpp.o
+ [100%] Linking CXX executable dcgan
+ [100%] Built target dcgan
+
+Above, we first created a ``build`` folder inside of our ``dcgan`` directory,
+entered this folder, ran the ``cmake`` command to generate the necessary build
+(Make) files and finally compiled the project successfully by running ``cmake
+--build . --config Release``. We are now all set to execute our minimal binary
+and complete this section on basic project configuration:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home/build# ./dcgan
+ 1 0 0
+ 0 1 0
+ 0 0 1
+ [ Variable[CPUFloatType]{3,3} ]
+
+Looks like an identity matrix to me!
+
+Defining the Neural Network Models
+----------------------------------
+
+Now that we have our basic environment configured, we can dive into the much
+more interesting parts of this tutorial. First, we will discuss how to define
+and interact with modules in the C++ frontend. We'll begin with basic,
+small-scale example modules and then implement a full-fledged GAN using the
+extensive library of built-in modules provided by the C++ frontend.
+
+Module API Basics
+^^^^^^^^^^^^^^^^^
+
+In line with the Python interface, neural networks based on the C++ frontend are
+composed of reusable building blocks called *modules*. There is a base module
+class from which all other modules are derived. In Python, this class is
+``torch.nn.Module`` and in C++ it is ``torch::nn::Module``. Besides a
+``forward()`` method that implements the algorithm the module encapsulates, a
+module usually contains any of three kinds of sub-objects: parameters, buffers
+and submodules.
+
+Parameters and buffers store state in the form of tensors. Parameters record
+gradients, while buffers do not. Parameters are usually the trainable weights of
+your neural network. Examples of buffers include means and variances for batch
+normalization. In order to re-use particular blocks of logic and state, the
+PyTorch API allows modules to be nested. A nested module is termed a
+*submodule*.
+
+Parameters, buffers and submodules must be explicitly registered. Once
+registered, methods like ``parameters()`` or ``buffers()`` can be used to
+retrieve a container of all parameters in the entire (nested) module hierarchy.
+Similarly, methods like ``to(...)``, where e.g. ``to(torch::kCUDA)`` moves all
+parameters and buffers from CPU to CUDA memory, work on the entire module
+hierarchy.
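+
+Buffers play no role in the examples that follow, so as a brief illustration
+only, here is a minimal sketch of registering one (``register_buffer`` mirrors
+``register_parameter``):
+
+.. code-block:: cpp
+
+ struct RunningStats : torch::nn::Module {
+   RunningStats() {
+     // Buffers are saved and moved with the module, but receive no gradients.
+     mean = register_buffer("mean", torch::zeros(5));
+   }
+   torch::Tensor mean;
+ };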
+
+Defining a Module and Registering Parameters
+********************************************
+
+To put these words into code, let's consider this simple module written in the
+Python interface:
+
+.. code-block:: python
+
+ import torch
+
+ class Net(torch.nn.Module):
+ def __init__(self, N, M):
+ super(Net, self).__init__()
+ self.W = torch.nn.Parameter(torch.randn(N, M))
+ self.b = torch.nn.Parameter(torch.randn(M))
+
+ def forward(self, input):
+ return torch.addmm(self.b, input, self.W)
+
+
+In C++, it would look like this:
+
+.. code-block:: cpp
+
+ #include <torch/torch.h>
+
+ struct Net : torch::nn::Module {
+ Net(int64_t N, int64_t M) {
+ W = register_parameter("W", torch::randn({N, M}));
+ b = register_parameter("b", torch::randn(M));
+ }
+ torch::Tensor forward(torch::Tensor input) {
+ return torch::addmm(b, input, W);
+ }
+ torch::Tensor W, b;
+ };
+
+Just like in Python, we define a class called ``Net`` (for simplicity here a
+``struct`` instead of a ``class``) and derive it from the module base class.
+Inside the constructor, we create tensors using ``torch::randn`` just like we
+use ``torch.randn`` in Python. One interesting difference is how we register the
+parameters. In Python, we wrap the tensors with the ``torch.nn.Parameter``
+class, while in C++ we have to pass the tensor through the
+``register_parameter`` method instead. The reason for this is that the Python
+API can detect that an attribute is of type ``torch.nn.Parameter`` and
+automatically registers such tensors. In C++, reflection is very limited, so a
+more traditional (and less magical) approach is provided.
+
+Registering Submodules and Traversing the Module Hierarchy
+**********************************************************
+
+In the same way we can register parameters, we can also register submodules. In
+Python, submodules are automatically detected and registered when they are
+assigned as an attribute of a module:
+
+.. code-block:: python
+
+ class Net(torch.nn.Module):
+ def __init__(self, N, M):
+ super(Net, self).__init__()
+ # Registered as a submodule behind the scenes
+ self.linear = torch.nn.Linear(N, M)
+ self.another_bias = torch.nn.Parameter(torch.rand(M))
+
+ def forward(self, input):
+ return self.linear(input) + self.another_bias
+
+This allows us, for example, to use the ``parameters()`` method to recursively
+access all parameters in our module hierarchy:
+
+.. code-block:: python
+
+ >>> net = Net(4, 5)
+ >>> print(list(net.parameters()))
+ [Parameter containing:
+ tensor([0.0808, 0.8613, 0.2017, 0.5206, 0.5353], requires_grad=True), Parameter containing:
+ tensor([[-0.3740, -0.0976, -0.4786, -0.4928],
+ [-0.1434, 0.4713, 0.1735, -0.3293],
+ [-0.3467, -0.3858, 0.1980, 0.1986],
+ [-0.1975, 0.4278, -0.1831, -0.2709],
+ [ 0.3730, 0.4307, 0.3236, -0.0629]], requires_grad=True), Parameter containing:
+ tensor([ 0.2038, 0.4638, -0.2023, 0.1230, -0.0516], requires_grad=True)]
+
+To register submodules in C++, use the aptly named ``register_module()`` method
+to register a module like ``torch::nn::Linear``:
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module {
+ Net(int64_t N, int64_t M)
+ : linear(register_module("linear", torch::nn::Linear(N, M))) {
+ another_bias = register_parameter("b", torch::randn(M));
+ }
+ torch::Tensor forward(torch::Tensor input) {
+ return linear(input) + another_bias;
+ }
+ torch::nn::Linear linear;
+ torch::Tensor another_bias;
+ };
+
+.. tip::
+
+ You can find the full list of available built-in modules like
+ ``torch::nn::Linear``, ``torch::nn::Dropout`` or ``torch::nn::Conv2d`` in the
+ documentation of the ``torch::nn`` namespace `here
+ <https://pytorch.org/cppdocs/api/namespace_torch__nn.html>`_.
+
+One subtlety about the above code is why the submodule was created in the
+constructor's initializer list, while the parameter was created inside the
+constructor body. There is a good reason for this, which we'll touch upon
+in the section on the C++ frontend's *ownership model* further below. The end
+result, however, is that we can recursively access our module tree's parameters
+just like in Python. Calling ``parameters()`` returns a
+``std::vector<torch::Tensor>``, which we can iterate over:
+
+.. code-block:: cpp
+
+ int main() {
+ Net net(4, 5);
+ for (const auto& p : net.parameters()) {
+ std::cout << p << std::endl;
+ }
+ }
+
+which prints:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home/build# ./dcgan
+ 0.0345
+ 1.4456
+ -0.6313
+ -0.3585
+ -0.4008
+ [ Variable[CPUFloatType]{5} ]
+ -0.1647 0.2891 0.0527 -0.0354
+ 0.3084 0.2025 0.0343 0.1824
+ -0.4630 -0.2862 0.2500 -0.0420
+ 0.3679 -0.1482 -0.0460 0.1967
+ 0.2132 -0.1992 0.4257 0.0739
+ [ Variable[CPUFloatType]{5,4} ]
+ 0.01 *
+ 3.6861
+ -10.1166
+ -45.0333
+ 7.9983
+ -20.0705
+ [ Variable[CPUFloatType]{5} ]
+
+with three parameters just like in Python. To also see the names of these
+parameters, the C++ API provides a ``named_parameters()`` method which returns
+an ``OrderedDict`` just like in Python:
+
+.. code-block:: cpp
+
+ Net net(4, 5);
+ for (const auto& pair : net.named_parameters()) {
+ std::cout << pair.key() << ": " << pair.value() << std::endl;
+ }
+
+which we can execute again to see the output:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home/build# make && ./dcgan
+ Scanning dependencies of target dcgan
+ [ 50%] Building CXX object CMakeFiles/dcgan.dir/dcgan.cpp.o
+ [100%] Linking CXX executable dcgan
+ [100%] Built target dcgan
+ b: -0.1863
+ -0.8611
+ -0.1228
+ 1.3269
+ 0.9858
+ [ Variable[CPUFloatType]{5} ]
+ linear.weight: 0.0339 0.2484 0.2035 -0.2103
+ -0.0715 -0.2975 -0.4350 -0.1878
+ -0.3616 0.1050 -0.4982 0.0335
+ -0.1605 0.4963 0.4099 -0.2883
+ 0.1818 -0.3447 -0.1501 -0.0215
+ [ Variable[CPUFloatType]{5,4} ]
+ linear.bias: -0.0250
+ 0.0408
+ 0.3756
+ -0.2149
+ -0.3636
+ [ Variable[CPUFloatType]{5} ]
+
+.. note::
+
+ `The documentation
+ <https://pytorch.org/cppdocs/api/classtorch_1_1nn_1_1_module.html>`_
+ for ``torch::nn::Module`` contains the full list of methods that operate on
+ the module hierarchy.
+
+Running the Network in Forward Mode
+***********************************
+
+To execute the network in C++, we simply call the ``forward()`` method we
+defined ourselves:
+
+.. code-block:: cpp
+
+ int main() {
+ Net net(4, 5);
+ std::cout << net.forward(torch::ones({2, 4})) << std::endl;
+ }
+
+which prints something like:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home/build# ./dcgan
+ 0.8559 1.1572 2.1069 -0.1247 0.8060
+ 0.8559 1.1572 2.1069 -0.1247 0.8060
+ [ Variable[CPUFloatType]{2,5} ]
+
+Module Ownership
+****************
+
+At this point, we know how to define a module in C++, register parameters,
+register submodules, traverse the module hierarchy via methods like
+``parameters()`` and finally run the module's ``forward()`` method. While there
+are many more methods, classes and topics to devour in the C++ API, I will refer
+you to `the docs <https://pytorch.org/cppdocs>`_ for
+the full menu. We'll also touch upon some more concepts as we implement the
+DCGAN model and end-to-end training pipeline in just a second. Before we do so,
+let me briefly touch upon the *ownership model* the C++ frontend provides for
+subclasses of ``torch::nn::Module``.
+
+For this discussion, the ownership model refers to the way modules are stored
+and passed around -- which determines who or what *owns* a particular module
+instance. In Python, objects are always allocated dynamically (on the heap) and
+have reference semantics. This is very easy to work with and straightforward to
+understand. In fact, in Python, you can largely forget about where objects live
+and how they get referenced, and focus on getting things done.
+
+C++, being a lower level language, provides more options in this realm. This
+increases complexity and heavily influences the design and ergonomics of the C++
+frontend. In particular, for modules in the C++ frontend, we have the option of
+using *either* value semantics *or* reference semantics. The first case is the
+simplest and was shown in the examples thus far: module objects are allocated on
+the stack and when passed to a function, can be either copied, moved (with
+``std::move``) or taken by reference or by pointer:
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module { };
+
+ void a(Net net) { }
+ void b(Net& net) { }
+ void c(Net* net) { }
+
+ int main() {
+ Net net;
+ a(net);
+ a(std::move(net));
+ b(net);
+ c(&net);
+ }
+
+For the second case -- reference semantics -- we can use ``std::shared_ptr``.
+The advantage of reference semantics is that, like in Python, it reduces the
+cognitive overhead of thinking about how modules must be passed to functions and
+how arguments must be declared (assuming you use ``shared_ptr`` everywhere).
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module {};
+
+ void a(std::shared_ptr<Net> net) { }
+
+ int main() {
+ auto net = std::make_shared<Net>();
+ a(net);
+ }
+
+In our experience, researchers coming from dynamic languages greatly prefer
+reference semantics over value semantics, even though the latter is more
+"native" to C++. It is also important to note that ``torch::nn::Module``'s
+design, in order to stay close to the ergonomics of the Python API, relies on
+shared ownership. For example, take our earlier (here shortened) definition of
+``Net``:
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module {
+ Net(int64_t N, int64_t M)
+ : linear(register_module("linear", torch::nn::Linear(N, M)))
+ { }
+ torch::nn::Linear linear;
+ };
+
+In order to use the ``linear`` submodule, we want to store it directly in our
+class. However, we also want the module base class to know about and have access
+to this submodule. For this, it must store a reference to this submodule. At
+this point, we have already arrived at the need for shared ownership. Both the
+``torch::nn::Module`` class and concrete ``Net`` class require a reference to
+the submodule. For this reason, the base class stores modules as
+``shared_ptr``\s, and therefore the concrete class must too.
+
+But wait! I don't see any mention of ``shared_ptr`` in the above code! Why is
+that? Well, because ``std::shared_ptr<MyModule>`` is a hell of a lot to type. To
+keep our researchers productive, we came up with an elaborate scheme to hide the
+mention of ``shared_ptr`` -- a benefit usually reserved for value semantics --
+while retaining reference semantics. To understand how this works, we can take a
+look at a simplified definition of the ``torch::nn::Linear`` module in the core
+library (the full definition is `here
+<https://github.com/pytorch/pytorch/blob/main/torch/csrc/api/include/torch/nn/modules/linear.h>`_):
+
+.. code-block:: cpp
+
+ struct LinearImpl : torch::nn::Module {
+ LinearImpl(int64_t in, int64_t out);
+
+ Tensor forward(const Tensor& input);
+
+ Tensor weight, bias;
+ };
+
+ TORCH_MODULE(Linear);
+
+In brief: the module is not called ``Linear``, but ``LinearImpl``. A macro,
+``TORCH_MODULE`` then defines the actual ``Linear`` class. This "generated"
+class is effectively a wrapper over a ``std::shared_ptr<LinearImpl>``. It is a
+wrapper instead of a simple typedef so that, among other things, constructors
+still work as expected, i.e. you can still write ``torch::nn::Linear(3, 4)``
+instead of ``std::make_shared<LinearImpl>(3, 4)``. We call the class created by
+the macro the module *holder*. Like with (shared) pointers, you access the
+underlying object using the arrow operator (like ``model->forward(...)``). The
+end result is an ownership model that resembles that of the Python API quite
+closely. Reference semantics become the default, but without the extra typing of
+``std::shared_ptr`` or ``std::make_shared``. For our ``Net``, using the module
+holder API looks like this:
+
+.. code-block:: cpp
+
+ struct NetImpl : torch::nn::Module {};
+ TORCH_MODULE(Net);
+
+ void a(Net net) { }
+
+ int main() {
+ Net net;
+ a(net);
+ }
+
+There is one subtle issue that deserves mention here. A default constructed
+``std::shared_ptr`` is "empty", i.e. contains a null pointer. What is a default
+constructed ``Linear`` or ``Net``? Well, it's a tricky choice. We could say it
+should be an empty (null) ``std::shared_ptr``. However, recall that
+``Linear(3, 4)`` is the same as ``std::make_shared<LinearImpl>(3, 4)``. This
+means that if we had decided that ``Linear linear;`` should be a null pointer,
+then there would be no way to construct a module that does not take any
+constructor arguments, or defaults all of them. For this reason, in the current
+API, a default constructed module holder (like ``Linear()``) invokes the
+default constructor of the underlying module (``LinearImpl()``). If the
+underlying module does not have a default constructor, you get a compiler error.
+To instead construct the empty holder, you can pass ``nullptr`` to the
+constructor of the holder.
+
+In practice, this means you can use submodules either like shown earlier, where
+the module is registered and constructed in the *initializer list*:
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module {
+ Net(int64_t N, int64_t M)
+ : linear(register_module("linear", torch::nn::Linear(N, M)))
+ { }
+ torch::nn::Linear linear;
+ };
+
+or you can first construct the holder with a null pointer and then assign to it
+in the constructor (more familiar for Pythonistas):
+
+.. code-block:: cpp
+
+ struct Net : torch::nn::Module {
+ Net(int64_t N, int64_t M) {
+ linear = register_module("linear", torch::nn::Linear(N, M));
+ }
+ torch::nn::Linear linear{nullptr}; // construct an empty holder
+ };
+
+In conclusion: Which ownership model -- which semantics -- should you use? The
+C++ frontend's API best supports the ownership model provided by module holders.
+The only disadvantage of this mechanism is one extra line of boilerplate below
+the module declaration. That said, the simplest model is still the value
+semantics model shown in the introduction to C++ modules. For small, simple
+scripts, you may get away with it too. But you'll find sooner or later that, for
+technical reasons, it is not always supported. For example, the serialization
+API (``torch::save`` and ``torch::load``) only supports module holders (or plain
+``shared_ptr``). As such, the module holder API is the recommended way of
+defining modules with the C++ frontend, and we will use this API in this
+tutorial henceforth.
+
+Defining the DCGAN Modules
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We now have the necessary background and introduction to define the modules for
+the machine learning task we want to solve in this post. To recap: our task is
+to generate images of digits from the `MNIST dataset
+<http://yann.lecun.com/exdb/mnist/>`_. We want to use a `generative adversarial
+network (GAN)
+<https://en.wikipedia.org/wiki/Generative_adversarial_network>`_ to solve
+this task. In particular, we'll use a `DCGAN architecture
+<https://arxiv.org/abs/1511.06434>`_ -- one of the first and simplest of its
+kind, but entirely sufficient for this task.
+
+.. tip::
+
+ You can find the full source code presented in this tutorial `in this
+ repository <https://github.com/pytorch/examples/tree/main/cpp/dcgan>`_.
+
+What was a GAN aGAN?
+********************
+
+A GAN consists of two distinct neural network models: a *generator* and a
+*discriminator*. The generator receives samples from a noise distribution, and
+its aim is to transform each noise sample into an image that resembles those of
+a target distribution -- in our case the MNIST dataset. The discriminator in
+turn receives either *real* images from the MNIST dataset, or *fake* images from
+the generator. It is asked to emit a probability judging how real (closer to
+``1``) or fake (closer to ``0``) a particular image is. Feedback from the
+discriminator on how real the images produced by the generator are is used to
+train the generator. Feedback on how good of an eye for authenticity the
+discriminator has is used to optimize the discriminator. In theory, a delicate
+balance between the generator and discriminator makes them improve in tandem,
+leading to the generator producing images indistinguishable from the target
+distribution, fooling the discriminator's (by then) excellent eye into emitting
+a probability of ``0.5`` for both real and fake images. For us, the end result
+is a machine that receives noise as input and generates realistic images of
+digits as its output.
+
+The Generator Module
+********************
+
+We begin by defining the generator module, which consists of a series of
+transposed 2D convolutions, batch normalizations and ReLU activation units.
+We explicitly pass inputs (in a functional way) between modules in the
+``forward()`` method of a module we define ourselves:
+
+.. code-block:: cpp
+
+ struct DCGANGeneratorImpl : nn::Module {
+ DCGANGeneratorImpl(int kNoiseSize)
+ : conv1(nn::ConvTranspose2dOptions(kNoiseSize, 256, 4)
+ .bias(false)),
+ batch_norm1(256),
+ conv2(nn::ConvTranspose2dOptions(256, 128, 3)
+ .stride(2)
+ .padding(1)
+ .bias(false)),
+ batch_norm2(128),
+ conv3(nn::ConvTranspose2dOptions(128, 64, 4)
+ .stride(2)
+ .padding(1)
+ .bias(false)),
+ batch_norm3(64),
+ conv4(nn::ConvTranspose2dOptions(64, 1, 4)
+ .stride(2)
+ .padding(1)
+ .bias(false))
+ {
+ // register_module() is needed if we want to use the parameters() method later on
+ register_module("conv1", conv1);
+ register_module("conv2", conv2);
+ register_module("conv3", conv3);
+ register_module("conv4", conv4);
+ register_module("batch_norm1", batch_norm1);
+ register_module("batch_norm2", batch_norm2);
+ register_module("batch_norm3", batch_norm3);
+ }
+
+ torch::Tensor forward(torch::Tensor x) {
+ x = torch::relu(batch_norm1(conv1(x)));
+ x = torch::relu(batch_norm2(conv2(x)));
+ x = torch::relu(batch_norm3(conv3(x)));
+ x = torch::tanh(conv4(x));
+ return x;
+ }
+
+ nn::ConvTranspose2d conv1, conv2, conv3, conv4;
+ nn::BatchNorm2d batch_norm1, batch_norm2, batch_norm3;
+ };
+ TORCH_MODULE(DCGANGenerator);
+
+ DCGANGenerator generator(kNoiseSize);
+
+We can now invoke ``forward()`` on the ``DCGANGenerator`` to map a noise sample to an image.
+
+The particular modules chosen, like ``nn::ConvTranspose2d`` and ``nn::BatchNorm2d``,
+follow the structure outlined earlier. The ``kNoiseSize`` constant determines
+the size of the input noise vector and is set to ``100``. Hyperparameters were,
+of course, found via grad student descent.
+
+.. attention::
+
+ No grad students were harmed in the discovery of hyperparameters. They were
+ fed Soylent regularly.
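+
+For reference, the constants used throughout this tutorial can be collected
+near the top of the training script. ``kNoiseSize`` and ``kBatchSize`` match
+the values mentioned in the text; the remaining values are representative
+choices you may want to tune:
+
+.. code-block:: cpp
+
+ // Hyperparameters for this tutorial.
+ const int64_t kNoiseSize = 100;        // size of the generator's noise input
+ const int64_t kBatchSize = 64;         // images per batch
+ const int64_t kNumberOfEpochs = 30;    // full passes over the dataset
+ const int64_t kCheckpointEvery = 200;  // batches between checkpoints
+ const bool kRestoreFromCheckpoint = false;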
+
+.. note::
+
+ A brief word on the way options are passed to built-in modules like ``Conv2d``
+ in the C++ frontend: Every module has some required options, like the number
+ of features for ``BatchNorm2d``. If you only need to configure the required
+ options, you can pass them directly to the module's constructor, like
+ ``BatchNorm2d(128)`` or ``Dropout(0.5)`` or ``Conv2d(8, 4, 2)`` (for input
+ channel count, output channel count, and kernel size). If, however, you need
+ to modify other options, which are normally defaulted, such as ``bias``
+ for ``Conv2d``, you need to construct and pass an *options* object. Every
+ module in the C++ frontend has an associated options struct, called
+ ``ModuleOptions`` where ``Module`` is the name of the module, like
+ ``LinearOptions`` for ``Linear``. This is what we do for the ``Conv2d``
+ modules above.
+
+The Discriminator Module
+************************
+
+The discriminator is similarly a sequence of convolutions, batch normalizations
+and activations. However, the convolutions are now regular ones instead of
+transposed, and we use a leaky ReLU with an alpha value of 0.2 instead of a
+vanilla ReLU. Also, the final activation becomes a Sigmoid, which squashes
+values into a range between 0 and 1. We can then interpret these squashed values
+as the probabilities the discriminator assigns to images being real.
+
+To build the discriminator, we will try something different: a `Sequential` module.
+Like in Python, PyTorch here provides two APIs for model definition: a functional one
+where inputs are passed through successive functions (e.g. the generator module example),
+and a more object-oriented one where we build a `Sequential` module containing the
+entire model as submodules. Using `Sequential`, the discriminator would look like:
+
+.. code-block:: cpp
+
+ nn::Sequential discriminator(
+ // Layer 1
+ nn::Conv2d(
+ nn::Conv2dOptions(1, 64, 4).stride(2).padding(1).bias(false)),
+ nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
+ // Layer 2
+ nn::Conv2d(
+ nn::Conv2dOptions(64, 128, 4).stride(2).padding(1).bias(false)),
+ nn::BatchNorm2d(128),
+ nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
+ // Layer 3
+ nn::Conv2d(
+ nn::Conv2dOptions(128, 256, 4).stride(2).padding(1).bias(false)),
+ nn::BatchNorm2d(256),
+ nn::LeakyReLU(nn::LeakyReLUOptions().negative_slope(0.2)),
+ // Layer 4
+ nn::Conv2d(
+ nn::Conv2dOptions(256, 1, 3).stride(1).padding(0).bias(false)),
+ nn::Sigmoid());
+
+.. tip::
+
+ A ``Sequential`` module simply performs function composition. The output of
+ the first submodule becomes the input of the second, the output of the third
+ becomes the input of the fourth and so on.
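+
+To make the shapes concrete, we can push a dummy batch through the
+discriminator. This is only a sanity check, not part of the training script:
+
+.. code-block:: cpp
+
+ // A batch of 64 single-channel 28x28 images (the MNIST shape).
+ torch::Tensor probe = torch::randn({64, 1, 28, 28});
+ torch::Tensor prob = discriminator->forward(probe);
+ // Each image has been reduced to a single probability: {64, 1, 1, 1}.
+ std::cout << prob.sizes() << std::endl;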
+
+
+Loading Data
+------------
+
+Now that we have defined the generator and discriminator model, we need some
+data we can train these models with. The C++ frontend, like the Python one,
+comes with a powerful parallel data loader. This data loader can read batches of
+data from a dataset (which you can define yourself) and provides many
+configuration knobs.
+
+.. note::
+
+ While the Python data loader uses multi-processing, the C++ data loader is truly
+ multi-threaded and does not launch any new processes.
+
+The data loader is part of the C++ frontend's ``data`` API, contained in the
+``torch::data::`` namespace. This API consists of a few different components:
+
+- The data loader class,
+- An API for defining datasets (sketched briefly below),
+- An API for defining *transforms*, which can be applied to datasets,
+- An API for defining *samplers*, which produce the indices with which datasets are indexed,
+- A library of existing datasets, transforms and samplers.
+
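+As a taste of the dataset API, a self-defined dataset derives from
+``torch::data::datasets::Dataset`` and implements ``get()`` and ``size()``. A
+minimal sketch (not needed for this tutorial, where we use the built-in MNIST
+dataset):
+
+.. code-block:: cpp
+
+ // Wraps a pair of tensors holding inputs and labels.
+ struct TensorPairDataset
+     : torch::data::datasets::Dataset<TensorPairDataset> {
+   torch::Tensor inputs_, labels_;
+
+   TensorPairDataset(torch::Tensor inputs, torch::Tensor labels)
+       : inputs_(std::move(inputs)), labels_(std::move(labels)) {}
+
+   // Return the (data, target) example at the given index.
+   torch::data::Example<> get(size_t index) override {
+     return {inputs_[index], labels_[index]};
+   }
+
+   // Optional so that streaming datasets can return nullopt.
+   torch::optional<size_t> size() const override {
+     return inputs_.size(0);
+   }
+ };
+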
+For this tutorial, we can use the ``MNIST`` dataset that comes with the C++
+frontend. Let's instantiate a ``torch::data::datasets::MNIST`` for this, and
+apply two transformations: First, we normalize the images so that they are in
+the range of ``-1`` to ``+1`` (from an original range of ``0`` to ``1``).
+Second, we apply the ``Stack`` *collation*, which takes a batch of tensors and
+stacks them into a single tensor along the first dimension:
+
+.. code-block:: cpp
+
+ auto dataset = torch::data::datasets::MNIST("./mnist")
+ .map(torch::data::transforms::Normalize<>(0.5, 0.5))
+ .map(torch::data::transforms::Stack<>());
+
+Note that the MNIST dataset should be located in the ``./mnist`` directory
+relative to wherever you execute the training binary from. You can use `this
+script `_
+to download the MNIST dataset.
+
+Next, we create a data loader and pass it this dataset. To make a new data
+loader, we use ``torch::data::make_data_loader``, which returns a
+``std::unique_ptr`` of the correct type (which depends on the type of the
+dataset, the type of the sampler and some other implementation details):
+
+.. code-block:: cpp
+
+ auto data_loader = torch::data::make_data_loader(std::move(dataset));
+
+The data loader does come with a lot of options. You can inspect the full set
+`here
+`_.
+For example, to speed up the data loading, we can increase the number of
+workers. The default number is zero, which means the main thread will be used.
+If we set ``workers`` to ``2``, two threads will be spawned that load data
+concurrently. We should also increase the batch size from its default of ``1``
+to something more reasonable, like ``64`` (the value of ``kBatchSize``). So
+let's create a ``DataLoaderOptions`` object and set the appropriate properties:
+
+.. code-block:: cpp
+
+ auto data_loader = torch::data::make_data_loader(
+ std::move(dataset),
+ torch::data::DataLoaderOptions().batch_size(kBatchSize).workers(2));
+
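+The training loop shown later prints a progress counter based on
+``batches_per_epoch``. One way to derive it is the following sketch; note that
+it has to be computed *before* ``std::move(dataset)`` hands the dataset to the
+loader:
+
+.. code-block:: cpp
+
+ // Requires <cmath> for std::ceil. With 60,000 MNIST training images and
+ // 64 images per batch, this comes out to 938.
+ const int64_t batches_per_epoch =
+     std::ceil(dataset.size().value() / static_cast<double>(kBatchSize));
+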
+
+We can now write a loop to load batches of data, which we'll only print to the
+console for now:
+
+.. code-block:: cpp
+
+ for (torch::data::Example<>& batch : *data_loader) {
+ std::cout << "Batch size: " << batch.data.size(0) << " | Labels: ";
+ for (int64_t i = 0; i < batch.data.size(0); ++i) {
+ std::cout << batch.target[i].item<int64_t>() << " ";
+ }
+ std::cout << std::endl;
+ }
+
+The type returned by the data loader in this case is a ``torch::data::Example``.
+This type is a simple struct with a ``data`` field for the data and a ``target``
+field for the label. Because we applied the ``Stack`` collation earlier, the
+data loader returns only a single such example. If we had not applied the
+collation, the data loader would yield ``std::vector<torch::data::Example<>>``
+instead, with one element per example in the batch.
+
+If you rebuild and run this code, you should see something like this:
+
+.. code-block:: shell
+
+ root@fa350df05ecf:/home/build# make
+ Scanning dependencies of target dcgan
+ [ 50%] Building CXX object CMakeFiles/dcgan.dir/dcgan.cpp.o
+ [100%] Linking CXX executable dcgan
+ [100%] Built target dcgan
+ root@fa350df05ecf:/home/build# make
+ [100%] Built target dcgan
+ root@fa350df05ecf:/home/build# ./dcgan
+ Batch size: 64 | Labels: 5 2 6 7 2 1 6 7 0 1 6 2 3 6 9 1 8 4 0 6 5 3 3 0 4 6 6 6 4 0 8 6 0 6 9 2 4 0 2 8 6 3 3 2 9 2 0 1 4 2 3 4 8 2 9 9 3 5 8 0 0 7 9 9
+ Batch size: 64 | Labels: 2 2 4 7 1 2 8 8 6 9 0 2 2 9 3 6 1 3 8 0 4 4 8 8 8 9 2 6 4 7 1 5 0 9 7 5 4 3 5 4 1 2 8 0 7 1 9 6 1 6 5 3 4 4 1 2 3 2 3 5 0 1 6 2
+ Batch size: 64 | Labels: 4 5 4 2 1 4 8 3 8 3 6 1 5 4 3 6 2 2 5 1 3 1 5 0 8 2 1 5 3 2 4 4 5 9 7 2 8 9 2 0 6 7 4 3 8 3 5 8 8 3 0 5 8 0 8 7 8 5 5 6 1 7 8 0
+ Batch size: 64 | Labels: 3 3 7 1 4 1 6 1 0 3 6 4 0 2 5 4 0 4 2 8 1 9 6 5 1 6 3 2 8 9 2 3 8 7 4 5 9 6 0 8 3 0 0 6 4 8 2 5 4 1 8 3 7 8 0 0 8 9 6 7 2 1 4 7
+ Batch size: 64 | Labels: 3 0 5 5 9 8 3 9 8 9 5 9 5 0 4 1 2 7 7 2 0 0 5 4 8 7 7 6 1 0 7 9 3 0 6 3 2 6 2 7 6 3 3 4 0 5 8 8 9 1 9 2 1 9 4 4 9 2 4 6 2 9 4 0
+ Batch size: 64 | Labels: 9 6 7 5 3 5 9 0 8 6 6 7 8 2 1 9 8 8 1 1 8 2 0 7 1 4 1 6 7 5 1 7 7 4 0 3 2 9 0 6 6 3 4 4 8 1 2 8 6 9 2 0 3 1 2 8 5 6 4 8 5 8 6 2
+ Batch size: 64 | Labels: 9 3 0 3 6 5 1 8 6 0 1 9 9 1 6 1 7 7 4 4 4 7 8 8 6 7 8 2 6 0 4 6 8 2 5 3 9 8 4 0 9 9 3 7 0 5 8 2 4 5 6 2 8 2 5 3 7 1 9 1 8 2 2 7
+ Batch size: 64 | Labels: 9 1 9 2 7 2 6 0 8 6 8 7 7 4 8 6 1 1 6 8 5 7 9 1 3 2 0 5 1 7 3 1 6 1 0 8 6 0 8 1 0 5 4 9 3 8 5 8 4 8 0 1 2 6 2 4 2 7 7 3 7 4 5 3
+ Batch size: 64 | Labels: 8 8 3 1 8 6 4 2 9 5 8 0 2 8 6 6 7 0 9 8 3 8 7 1 6 6 2 7 7 4 5 5 2 1 7 9 5 4 9 1 0 3 1 9 3 9 8 8 5 3 7 5 3 6 8 9 4 2 0 1 2 5 4 7
+ Batch size: 64 | Labels: 9 2 7 0 8 4 4 2 7 5 0 0 6 2 0 5 9 5 9 8 8 9 3 5 7 5 4 7 3 0 5 7 6 5 7 1 6 2 8 7 6 3 2 6 5 6 1 2 7 7 0 0 5 9 0 0 9 1 7 8 3 2 9 4
+ Batch size: 64 | Labels: 7 6 5 7 7 5 2 2 4 9 9 4 8 7 4 8 9 4 5 7 1 2 6 9 8 5 1 2 3 6 7 8 1 1 3 9 8 7 9 5 0 8 5 1 8 7 2 6 5 1 2 0 9 7 4 0 9 0 4 6 0 0 8 6
+ ...
+
+Which means we are successfully able to load data from the MNIST dataset.
+
+Writing the Training Loop
+-------------------------
+
+Let's now finish the algorithmic part of our example and implement the delicate
+dance between the generator and discriminator. First, we'll create two
+optimizers, one for the generator and one for the discriminator. The optimizers
+we use implement the `Adam <https://arxiv.org/abs/1412.6980>`_ algorithm:
+
+.. code-block:: cpp
+
+ torch::optim::Adam generator_optimizer(
+ generator->parameters(), torch::optim::AdamOptions(2e-4).betas(std::make_tuple(0.5, 0.5)));
+ torch::optim::Adam discriminator_optimizer(
+ discriminator->parameters(), torch::optim::AdamOptions(5e-4).betas(std::make_tuple(0.5, 0.5)));
+
+.. note::
+
+ As of this writing, the C++ frontend provides optimizers implementing Adagrad,
+ Adam, LBFGS, RMSprop and SGD. The `docs
+ `_ have the
+ up-to-date list.
+
+Next, we need to update our training loop. We'll add an outer loop to exhaust
+the data loader every epoch and then write the GAN training code:
+
+.. code-block:: cpp
+
+ for (int64_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) {
+ int64_t batch_index = 0;
+ for (torch::data::Example<>& batch : *data_loader) {
+ // Train discriminator with real images.
+ discriminator->zero_grad();
+ torch::Tensor real_images = batch.data;
+ torch::Tensor real_labels = torch::empty(batch.data.size(0)).uniform_(0.8, 1.0);
+ torch::Tensor real_output = discriminator->forward(real_images).reshape(real_labels.sizes());
+ torch::Tensor d_loss_real = torch::binary_cross_entropy(real_output, real_labels);
+ d_loss_real.backward();
+
+ // Train discriminator with fake images.
+ torch::Tensor noise = torch::randn({batch.data.size(0), kNoiseSize, 1, 1});
+ torch::Tensor fake_images = generator->forward(noise);
+ torch::Tensor fake_labels = torch::zeros(batch.data.size(0));
+ torch::Tensor fake_output = discriminator->forward(fake_images.detach()).reshape(fake_labels.sizes());
+ torch::Tensor d_loss_fake = torch::binary_cross_entropy(fake_output, fake_labels);
+ d_loss_fake.backward();
+
+ torch::Tensor d_loss = d_loss_real + d_loss_fake;
+ discriminator_optimizer.step();
+
+ // Train generator.
+ generator->zero_grad();
+ fake_labels.fill_(1);
+ fake_output = discriminator->forward(fake_images).reshape(fake_labels.sizes());
+ torch::Tensor g_loss = torch::binary_cross_entropy(fake_output, fake_labels);
+ g_loss.backward();
+ generator_optimizer.step();
+
+ std::printf(
+ "\r[%2ld/%2ld][%3ld/%3ld] D_loss: %.4f | G_loss: %.4f",
+ epoch,
+ kNumberOfEpochs,
+ ++batch_index,
+ batches_per_epoch,
+ d_loss.item<float>(),
+ g_loss.item<float>());
+ }
+ }
+
+Above, we first evaluate the discriminator on real images, for which it should
+assign a high probability. For this, we use
+``torch::empty(batch.data.size(0)).uniform_(0.8, 1.0)`` as the target
+probabilities.
+
+.. note::
+
+ We pick random values uniformly distributed between 0.8 and 1.0 instead of 1.0
+ everywhere in order to make the discriminator training more robust. This trick
+ is called *label smoothing*.
+
+Before evaluating the discriminator, we zero out the gradients of its
+parameters. After computing the loss, we back-propagate it through the network by
+calling ``d_loss.backward()`` to compute new gradients. We repeat this spiel for
+the fake images. Instead of using images from the dataset, we let the generator
+create fake images for this by feeding it a batch of random noise. We then
+forward those fake images to the discriminator. This time, we want the
+discriminator to emit low probabilities, ideally all zeros. Once we have
+computed the discriminator loss for both the batch of real and the batch of fake
+images, we can progress the discriminator's optimizer by one step in order to
+update its parameters.
+
+To train the generator, we again first zero its gradients, and then re-evaluate
+the discriminator on the fake images. However, this time we want the
+discriminator to assign probabilities very close to one, which would indicate
+that the generator can produce images that fool the discriminator into thinking
+they are actually real (from the dataset). For this, we fill the ``fake_labels``
+tensor with all ones. We finally step the generator's optimizer to also update
+its parameters.
+
+We should now be ready to train our model on the CPU. We don't have any code yet
+to capture state or sample outputs, but we'll add this in just a moment. For
+now, let's just observe that our model is doing *something* -- we'll later
+verify based on the generated images whether this something is meaningful.
+Re-building and running should print something like:
+
+.. code-block:: shell
+
+ root@3c0711f20896:/home/build# make && ./dcgan
+ Scanning dependencies of target dcgan
+ [ 50%] Building CXX object CMakeFiles/dcgan.dir/dcgan.cpp.o
+ [100%] Linking CXX executable dcgan
+ [100%] Built target dcgan
+ [ 1/10][100/938] D_loss: 0.6876 | G_loss: 4.1304
+ [ 1/10][200/938] D_loss: 0.3776 | G_loss: 4.3101
+ [ 1/10][300/938] D_loss: 0.3652 | G_loss: 4.6626
+ [ 1/10][400/938] D_loss: 0.8057 | G_loss: 2.2795
+ [ 1/10][500/938] D_loss: 0.3531 | G_loss: 4.4452
+ [ 1/10][600/938] D_loss: 0.3501 | G_loss: 5.0811
+ [ 1/10][700/938] D_loss: 0.3581 | G_loss: 4.5623
+ [ 1/10][800/938] D_loss: 0.6423 | G_loss: 1.7385
+ [ 1/10][900/938] D_loss: 0.3592 | G_loss: 4.7333
+ [ 2/10][100/938] D_loss: 0.4660 | G_loss: 2.5242
+ [ 2/10][200/938] D_loss: 0.6364 | G_loss: 2.0886
+ [ 2/10][300/938] D_loss: 0.3717 | G_loss: 3.8103
+ [ 2/10][400/938] D_loss: 1.0201 | G_loss: 1.3544
+ [ 2/10][500/938] D_loss: 0.4522 | G_loss: 2.6545
+ ...
+
+Moving to the GPU
+-----------------
+
+While our current script can run just fine on the CPU, we all know convolutions
+are a lot faster on GPU. Let's quickly discuss how we can move our training onto
+the GPU. We'll need to do two things for this: pass a GPU device specification
+to tensors we allocate ourselves, and explicitly copy any other tensors onto the
+GPU via the ``to()`` method all tensors and modules in the C++ frontend have.
+The simplest way to achieve both is to create an instance of ``torch::Device``
+at the top level of our training script, and then pass that device to tensor
+factory functions like ``torch::zeros`` as well as the ``to()`` method. We can
+start by doing this with a CPU device:
+
+.. code-block:: cpp
+
+ // Place this somewhere at the top of your training script.
+ torch::Device device(torch::kCPU);
+
+New tensor allocations like
+
+.. code-block:: cpp
+
+ torch::Tensor fake_labels = torch::zeros(batch.data.size(0));
+
+should be updated to take the ``device`` as the last argument:
+
+.. code-block:: cpp
+
+ torch::Tensor fake_labels = torch::zeros(batch.data.size(0), device);
+
+For tensors whose creation is not in our hands, like those coming from the MNIST
+dataset, we must insert explicit ``to()`` calls. This means
+
+.. code-block:: cpp
+
+ torch::Tensor real_images = batch.data;
+
+becomes
+
+.. code-block:: cpp
+
+ torch::Tensor real_images = batch.data.to(device);
+
+and also our model parameters should be moved to the correct device:
+
+.. code-block:: cpp
+
+ generator->to(device);
+ discriminator->to(device);
+
+.. note::
+
+ If a tensor already lives on the device supplied to ``to()``, the call is a
+ no-op. No extra copy is made.
+
+At this point, we've just made our previous CPU-residing code more explicit.
+However, it is now also very easy to change the device to a CUDA device:
+
+.. code-block:: cpp
+
+ torch::Device device(torch::kCUDA);
+
+And now all tensors will live on the GPU, calling into fast CUDA kernels for all
+operations, without us having to change any downstream code. If we wanted to
+specify a particular device index, it could be passed as the second argument to
+the ``Device`` constructor. If we wanted different tensors to live on different
+devices, we could pass separate device instances (for example one on CUDA device
+0 and the other on CUDA device 1). We can even do this configuration
+dynamically, which is often useful to make our training scripts more portable:
+
+.. code-block:: cpp
+
+ torch::Device device = torch::kCPU;
+ if (torch::cuda::is_available()) {
+ std::cout << "CUDA is available! Training on GPU." << std::endl;
+ device = torch::kCUDA;
+ }
+
+or even
+
+.. code-block:: cpp
+
+ torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
+
+Checkpointing and Recovering the Training State
+-----------------------------------------------
+
+The last augmentation we should make to our training script is to periodically
+save the state of our model parameters, the state of our optimizers as well as a
+few generated image samples. If our computer were to crash in the middle of the
+training procedure, the first two will allow us to restore the training state.
+For long-lasting training sessions, this is absolutely essential. Fortunately,
+the C++ frontend provides an API to serialize and deserialize both model and
+optimizer state, as well as individual tensors.
+
+The core API for this is ``torch::save(thing, filename)`` and
+``torch::load(thing, filename)``, where ``thing`` could be a
+``torch::nn::Module`` subclass or an optimizer instance like the ``Adam`` object
+we have in our training script. Let's update our training loop to checkpoint the
+model and optimizer state at a certain interval:
+
+.. code-block:: cpp
+
+ if (batch_index % kCheckpointEvery == 0) {
+ // Checkpoint the model and optimizer state.
+ torch::save(generator, "generator-checkpoint.pt");
+ torch::save(generator_optimizer, "generator-optimizer-checkpoint.pt");
+ torch::save(discriminator, "discriminator-checkpoint.pt");
+ torch::save(discriminator_optimizer, "discriminator-optimizer-checkpoint.pt");
+ // Sample the generator and save the images.
+ torch::Tensor samples = generator->forward(torch::randn({8, kNoiseSize, 1, 1}, device));
+ torch::save((samples + 1.0) / 2.0, torch::str("dcgan-sample-", checkpoint_counter, ".pt"));
+ std::cout << "\n-> checkpoint " << ++checkpoint_counter << '\n';
+ }
+
+where ``kCheckpointEvery`` is an integer set to something like ``100`` to
+checkpoint every ``100`` batches, and ``checkpoint_counter`` is a counter bumped
+every time we make a checkpoint.
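+
+Since ``torch::save`` also accepts individual tensors (as used for the image
+samples above), a micro-example of round-tripping a tensor looks like this:
+
+.. code-block:: cpp
+
+ torch::Tensor x = torch::ones(3);
+ torch::save(x, "x.pt");   // serialize a single tensor
+ torch::Tensor y;
+ torch::load(y, "x.pt");   // deserialize into an existing tensor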
+
+To restore the training state, you can add lines like these after all models and
+optimizers are created, but before the training loop:
+
+.. code-block:: cpp
+
+ torch::optim::Adam generator_optimizer(
+ generator->parameters(), torch::optim::AdamOptions(2e-4).betas(std::make_tuple(0.5, 0.5)));
+ torch::optim::Adam discriminator_optimizer(
+ discriminator->parameters(), torch::optim::AdamOptions(5e-4).betas(std::make_tuple(0.5, 0.5)));
+
+ if (kRestoreFromCheckpoint) {
+ torch::load(generator, "generator-checkpoint.pt");
+ torch::load(generator_optimizer, "generator-optimizer-checkpoint.pt");
+ torch::load(discriminator, "discriminator-checkpoint.pt");
+ torch::load(
+ discriminator_optimizer, "discriminator-optimizer-checkpoint.pt");
+ }
+
+ int64_t checkpoint_counter = 0;
+ for (int64_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) {
+ int64_t batch_index = 0;
+ for (torch::data::Example<>& batch : *data_loader) {
+ // ... the rest of the training loop as before ...
+
+
+Inspecting Generated Images
+---------------------------
+
+Our training script is now complete. We are ready to train our GAN, whether on
+CPU or GPU. To inspect the intermediary output of our training procedure, for
+which we added code to periodically save image samples to the
+``"dcgan-sample-xxx.pt"`` file, we can write a tiny Python script to load the
+tensors and display them with matplotlib:
+
+.. code-block:: python
+
+ import argparse
+
+ import matplotlib.pyplot as plt
+ import torch
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--sample-file", required=True)
+ parser.add_argument("-o", "--out-file", default="out.png")
+ parser.add_argument("-d", "--dimension", type=int, default=3)
+ options = parser.parse_args()
+
+ module = torch.jit.load(options.sample_file)
+ images = list(module.parameters())[0]
+
+ for index in range(options.dimension * options.dimension):
+ image = images[index].detach().cpu().reshape(28, 28).mul(255).to(torch.uint8)
+ array = image.numpy()
+ axis = plt.subplot(options.dimension, options.dimension, 1 + index)
+ plt.imshow(array, cmap="gray")
+ axis.get_xaxis().set_visible(False)
+ axis.get_yaxis().set_visible(False)
+
+ plt.savefig(options.out_file)
+ print("Saved ", options.out_file)
+
+Let's now train our model for around 30 epochs:
+
+.. code-block:: shell
+
+ root@3c0711f20896:/home/build# make && ./dcgan
+ Scanning dependencies of target dcgan
+ [ 50%] Building CXX object CMakeFiles/dcgan.dir/dcgan.cpp.o
+ [100%] Linking CXX executable dcgan
+ [100%] Built target dcgan
+ CUDA is available! Training on GPU.
+ [ 1/30][200/938] D_loss: 0.4953 | G_loss: 4.0195
+ -> checkpoint 1
+ [ 1/30][400/938] D_loss: 0.3610 | G_loss: 4.8148
+ -> checkpoint 2
+ [ 1/30][600/938] D_loss: 0.4072 | G_loss: 4.36760
+ -> checkpoint 3
+ [ 1/30][800/938] D_loss: 0.4444 | G_loss: 4.0250
+ -> checkpoint 4
+ [ 2/30][200/938] D_loss: 0.3761 | G_loss: 3.8790
+ -> checkpoint 5
+ [ 2/30][400/938] D_loss: 0.3977 | G_loss: 3.3315
+ ...
+ -> checkpoint 120
+ [30/30][938/938] D_loss: 0.3610 | G_loss: 3.8084
+
+And display the images in a plot:
+
+.. code-block:: shell
+
+ root@3c0711f20896:/home/build# python display.py -i dcgan-sample-100.pt
+ Saved out.png
+
+Which should look something like this:
+
+.. figure:: /_static/img/cpp-frontend/digits.png
+ :alt: digits
+
+Digits! Hooray! Now the ball is in your court: can you improve the model to make
+the digits look even better?
+
+Conclusion
+----------
+
+This tutorial has hopefully given you a digestible digest of the PyTorch C++
+frontend. A machine learning library like PyTorch by necessity has a very broad
+and extensive API. As such, there are many concepts we did not have time or
+space to discuss here. However, I encourage you to try out the API, and consult
+`our documentation <https://pytorch.org/cppdocs/>`_ and in particular the
+`Library API <https://pytorch.org/cppdocs/api/library_root.html>`_ section when
+you get stuck. Also, remember that you can expect the C++ frontend to follow the
+design and semantics of the Python frontend whenever we could make this
+possible, so you can leverage this fact to increase your learning rate.
+
+.. tip::
+
+ You can find the full source code presented in this tutorial `in this
+ repository <https://github.com/pytorch/examples/tree/master/cpp/dcgan>`_.
+
+As always, if you run into any problems or have questions, you can use our
+`forum <https://discuss.pytorch.org>`_ or `GitHub issues
+<https://github.com/pytorch/pytorch/issues>`_ to get in touch.
diff --git a/advanced_source/custom_class_pt2.rst b/advanced_source/custom_class_pt2.rst
new file mode 100644
index 00000000000..229a94f2ce9
--- /dev/null
+++ b/advanced_source/custom_class_pt2.rst
@@ -0,0 +1,275 @@
+Supporting Custom C++ Classes in torch.compile/torch.export
+===========================================================
+
+
+This tutorial is a follow-on to the
+:doc:`custom C++ classes <custom_classes>` tutorial, and
+introduces additional steps that are needed to support custom C++ classes in
+torch.compile/torch.export.
+
+.. warning::
+
+ This feature is in prototype status and is subject to backwards compatibility
+ breaking changes. This tutorial provides a snapshot as of PyTorch 2.8. If
+ you run into any issues, please reach out to us on GitHub!
+
+Concretely, there are a few steps:
+
+1. Implement an ``__obj_flatten__`` method on the C++ custom class to allow
+ us to inspect its state and guard against changes to it. The method should
+ return a tuple of (attribute_name, value) pairs
+ (``tuple[tuple[str, value] * n]``).
+
+2. Register a Python fake class using ``@torch._library.register_fake_class``.
+
+ a. Implement “fake methods” for each of the class’s C++ methods, which should
+ have the same schema as the C++ implementation.
+
+ b. Additionally, implement an ``__obj_unflatten__`` classmethod in the Python
+ fake class to tell us how to create a fake class from the flattened
+ states returned by ``__obj_flatten__``.
+
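+For the tensor queue used in this tutorial, a flattened state might look like
+the following sketch (the tensor names are illustrative):
+
+.. code-block:: python
+
+ # A tuple of (attribute_name, value) pairs, as returned by __obj_flatten__
+ # and consumed by __obj_unflatten__.
+ flattened_tq = (
+ ("queue", [tensor0, tensor1]), # the queued tensors
+ ("init_tensor_", init_tensor), # default tensor returned by pop()
+ )
+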
+Here is a breakdown of the steps. Following the guide in
+:doc:`Extending TorchScript with Custom C++ Classes <custom_classes>`,
+we can create a thread-safe tensor queue and build it.
+
+.. code-block:: cpp
+
+ // Thread-safe Tensor Queue
+
+ #include <torch/custom_class.h>
+ #include <torch/script.h>
+
+ #include <iostream>
+ #include <string>
+ #include <vector>
+
+ using namespace torch::jit;
+
+ // Thread-safe Tensor Queue
+ struct TensorQueue : torch::CustomClassHolder {
+ explicit TensorQueue(at::Tensor t) : init_tensor_(t) {}
+
+ explicit TensorQueue(c10::Dict<std::string, at::Tensor> dict) {
+ init_tensor_ = dict.at(std::string("init_tensor"));
+ const std::string key = "queue";
+ at::Tensor size_tensor;
+ size_tensor = dict.at(std::string(key + "/size")).cpu();
+ const auto* size_tensor_acc = size_tensor.const_data_ptr<int64_t>();
+ int64_t queue_size = size_tensor_acc[0];
+
+ for (const auto index : c10::irange(queue_size)) {
+ queue_.push_back(dict.at(key + "/" + std::to_string(index)));
+ }
+ }
+
+ // Push the element to the rear of queue.
+ // A lock is held for thread safety.
+ void push(at::Tensor x) {
+ std::lock_guard<std::mutex> guard(mutex_);
+ queue_.push_back(x);
+ }
+ // Pop the front element of queue and return it.
+ // If empty, return init_tensor_.
+ // A lock is held for thread safety.
+ at::Tensor pop() {
+ std::lock_guard<std::mutex> guard(mutex_);
+ if (!queue_.empty()) {
+ auto val = queue_.front();
+ queue_.pop_front();
+ return val;
+ } else {
+ return init_tensor_;
+ }
+ }
+
+ std::vector<at::Tensor> get_raw_queue() {
+ std::vector<at::Tensor> raw_queue(queue_.begin(), queue_.end());
+ return raw_queue;
+ }
+
+ private:
+ std::deque<at::Tensor> queue_;
+ std::mutex mutex_;
+ at::Tensor init_tensor_;
+ };
+
+ // The torch binding code
+ TORCH_LIBRARY(MyCustomClass, m) {
+ m.class_("TensorQueue")
+ .def(torch::init())
+ .def("push", &TensorQueue::push)
+ .def("pop", &TensorQueue::pop)
+ .def("get_raw_queue", &TensorQueue::get_raw_queue);
+ }
+
+**Step 1**: Add an ``__obj_flatten__`` method to the C++ custom class implementation:
+
+.. code-block:: cpp
+
+ // Thread-safe Tensor Queue
+ struct TensorQueue : torch::CustomClassHolder {
+ ...
+ std::tuple<std::tuple<std::string, std::vector<at::Tensor>>, std::tuple<std::string, at::Tensor>> __obj_flatten__() {
+ return std::tuple(std::tuple("queue", this->get_raw_queue()), std::tuple("init_tensor_", this->init_tensor_.clone()));
+ }
+ ...
+ };
+
+ TORCH_LIBRARY(MyCustomClass, m) {
+ m.class_("TensorQueue")
+ .def(torch::init())
+ ...
+ .def("__obj_flatten__", &TensorQueue::__obj_flatten__);
+ }
+
+**Step 2a**: Register a fake class in Python that implements each method.
+
+.. code-block:: python
+
+ from typing import List
+
+ import torch
+
+ # namespace::class_name
+ @torch._library.register_fake_class("MyCustomClass::TensorQueue")
+ class FakeTensorQueue:
+ def __init__(
+ self,
+ queue: List[torch.Tensor],
+ init_tensor_: torch.Tensor
+ ) -> None:
+ self.queue = queue
+ self.init_tensor_ = init_tensor_
+
+ def push(self, tensor: torch.Tensor) -> None:
+ self.queue.append(tensor)
+
+ def pop(self) -> torch.Tensor:
+ if len(self.queue) > 0:
+ return self.queue.pop(0)
+ return self.init_tensor_
+
+**Step 2b**: Implement an ``__obj_unflatten__`` classmethod in Python.
+
+.. code-block:: python
+
+ # namespace::class_name
+ @torch._library.register_fake_class("MyCustomClass::TensorQueue")
+ class FakeTensorQueue:
+ ...
+ @classmethod
+ def __obj_unflatten__(cls, flattened_tq):
+ return cls(**dict(flattened_tq))
+
+
+That’s it! Now we can create a module that uses this object and run it with ``torch.compile`` or ``torch.export``.
+
+.. code-block:: python
+
+ import torch
+
+ torch.classes.load_library("build/libcustom_class.so")
+ tq = torch.classes.MyCustomClass.TensorQueue(torch.empty(0).fill_(-1))
+
+ class Mod(torch.nn.Module):
+ def forward(self, tq, x):
+ tq.push(x.sin())
+ tq.push(x.cos())
+ popped_t = tq.pop()
+ assert torch.allclose(popped_t, x.sin())
+ return tq, popped_t
+
+ tq, popped_t = torch.compile(Mod(), backend="eager", fullgraph=True)(tq, torch.randn(2, 3))
+ assert tq.size() == 1
+
+ exported_program = torch.export.export(Mod(), (tq, torch.randn(2, 3),), strict=False)
+ exported_program.module()(tq, torch.randn(2, 3))
+
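+Note that the example above calls ``tq.size()``, which the binding shown earlier
+does not expose. A sketch of what would need to be added for that call to work
+(a C++ ``size`` method plus its binding; not part of the listing above):
+
+.. code-block:: cpp
+
+ // Inside TensorQueue: report the current queue length.
+ // The lock matches push() and pop() for thread safety.
+ int64_t size() {
+ std::lock_guard<std::mutex> guard(mutex_);
+ return queue_.size();
+ }
+
+ // ...and inside the TORCH_LIBRARY block:
+ // .def("size", &TensorQueue::size)
+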
+We can also implement custom ops that take custom classes as inputs. For
+example, we could register a custom op ``for_each_add_(tq, tensor)``:
+
+.. code-block:: cpp
+
+ struct TensorQueue : torch::CustomClassHolder {
+ ...
+ void for_each_add_(at::Tensor inc) {
+ for (auto& t : queue_) {
+ t.add_(inc);
+ }
+ }
+ ...
+ }
+
+
+ TORCH_LIBRARY_FRAGMENT(MyCustomClass, m) {
+ m.class_("TensorQueue")
+ ...
+ .def("for_each_add_", &TensorQueue::for_each_add_);
+
+ m.def(
+ "for_each_add_(__torch__.torch.classes.MyCustomClass.TensorQueue foo, Tensor inc) -> ()");
+ }
+
+ void for_each_add_(c10::intrusive_ptr<TensorQueue> tq, at::Tensor inc) {
+ tq->for_each_add_(inc);
+ }
+
+ TORCH_LIBRARY_IMPL(MyCustomClass, CPU, m) {
+ m.impl("for_each_add_", for_each_add_);
+ }
+
+
+Since the fake class is implemented in Python, the fake
+implementation of the custom op must also be registered in Python:
+
+.. code-block:: python
+
+ @torch.library.register_fake("MyCustomClass::for_each_add_")
+ def fake_for_each_add_(tq, inc):
+ tq.for_each_add_(inc)
+
+After re-compiling the library, we can export a module that uses the custom op:
+
+.. code-block:: python
+
+ class ForEachAdd(torch.nn.Module):
+ def forward(self, tq: torch.ScriptObject, a: torch.Tensor) -> torch.ScriptObject:
+ torch.ops.MyCustomClass.for_each_add_(tq, a)
+ return tq
+
+ mod = ForEachAdd()
+ tq = empty_tensor_queue()
+ qlen = 10
+ for i in range(qlen):
+ tq.push(torch.zeros(1))
+
+ ep = torch.export.export(mod, (tq, torch.ones(1)), strict=False)
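+
+Here ``empty_tensor_queue()`` is assumed to be a small helper; a sketch,
+mirroring the constructor call used earlier in this tutorial:
+
+.. code-block:: python
+
+ def empty_tensor_queue() -> torch.ScriptObject:
+ # Same initialization as the earlier example: pop() on an empty
+ # queue returns this empty tensor.
+ return torch.classes.MyCustomClass.TensorQueue(torch.empty(0).fill_(-1))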
+
+Why do we need to make a Fake Class?
+------------------------------------
+
+Tracing with a real custom object has several major downsides:
+
+1. Operators on real objects can be time-consuming, e.g. the custom object
+ might be reading from the network or loading data from disk.
+
+2. We don’t want to mutate the real custom object or create side effects in the environment while tracing.
+
+3. It cannot support dynamic shapes.
+
+However, it may be difficult for users to write a fake class, e.g. if the
+original class uses some third-party library that determines the output shape of
+the methods, or is complicated and written by others. In such cases, users can
+disable the fakification requirement by defining a ``tracing_mode`` method to
+return ``"real"``:
+
+.. code-block:: cpp
+
+ std::string tracing_mode() {
+ return "real";
+ }
+
+
+A caveat of fakification concerns **tensor aliasing**: we assume that no
+tensor within a torchbind object aliases a tensor outside of the torchbind
+object. Therefore, mutating one of these tensors will result in undefined
+behavior.
diff --git a/advanced_source/custom_classes.rst b/advanced_source/custom_classes.rst
new file mode 100644
index 00000000000..014bac2eebf
--- /dev/null
+++ b/advanced_source/custom_classes.rst
@@ -0,0 +1,231 @@
+Extending PyTorch with Custom C++ Classes
+===============================================
+
+
+This tutorial introduces an API for binding C++ classes into PyTorch.
+The API is very similar to
+`pybind11 <https://github.com/pybind/pybind11>`_, and most of the concepts will transfer
+over if you're familiar with that system.
+
+Implementing and Binding the Class in C++
+-----------------------------------------
+
+For this tutorial, we are going to define a simple C++ class that maintains persistent
+state in a member variable.
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp
+ :language: cpp
+ :start-after: BEGIN class
+ :end-before: END class
+
+There are several things to note:
+
+- ``torch/custom_class.h`` is the header you need to include to extend PyTorch
+ with your custom class.
+- Notice that whenever we are working with instances of the custom
+ class, we do it via instances of ``c10::intrusive_ptr<>``. Think of ``intrusive_ptr``
+ as a smart pointer like ``std::shared_ptr``, but the reference count is stored
+ directly in the object, as opposed to a separate metadata block (as is done in
+ ``std::shared_ptr``). ``torch::Tensor`` internally uses the same pointer type,
+ and custom classes have to also use this pointer type so that we can
+ consistently manage different object types; a short sketch follows this list.
+- Notice also that the user-defined class must inherit from
+ ``torch::CustomClassHolder``. This ensures that the custom class has space to
+ store the reference count.
+
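+For instance, here is a minimal sketch of allocating an instance and handing out
+an ``intrusive_ptr`` (assuming the ``MyStackClass`` template defined in the
+listing above):
+
+.. code-block:: cpp
+
+ // Allocate an instance and obtain an intrusive_ptr to it in one step.
+ auto stack = c10::make_intrusive<MyStackClass<std::string>>(
+ std::vector<std::string>{"foo", "bar"});
+ stack->push("baz");
+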
+Now let's take a look at how we will make this class visible to PyTorch, a process called
+*binding* the class:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp
+ :language: cpp
+ :start-after: BEGIN binding
+ :end-before: END binding
+ :append:
+ ;
+ }
+
+
+
+Building the Example as a C++ Project With CMake
+------------------------------------------------
+
+Now, we're going to build the above C++ code with the `CMake
+<https://cmake.org>`_ build system. First, take all the C++ code
+we've covered so far and place it in a file called ``class.cpp``.
+Then, write a simple ``CMakeLists.txt`` file and place it in the
+same directory. Here is what ``CMakeLists.txt`` should look like:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/CMakeLists.txt
+ :language: cmake
+
+Also, create a ``build`` directory. Your file tree should look like this::
+
+ custom_class_project/
+ class.cpp
+ CMakeLists.txt
+ build/
+
+Go ahead and invoke cmake and then make to build the project:
+
+.. code-block:: shell
+
+ $ cd build
+ $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..
+ -- The C compiler identification is GNU 7.3.1
+ -- The CXX compiler identification is GNU 7.3.1
+ -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc
+ -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc -- works
+ -- Detecting C compiler ABI info
+ -- Detecting C compiler ABI info - done
+ -- Detecting C compile features
+ -- Detecting C compile features - done
+ -- Check for working CXX compiler: /opt/rh/devtoolset-7/root/usr/bin/c++
+ -- Check for working CXX compiler: /opt/rh/devtoolset-7/root/usr/bin/c++ -- works
+ -- Detecting CXX compiler ABI info
+ -- Detecting CXX compiler ABI info - done
+ -- Detecting CXX compile features
+ -- Detecting CXX compile features - done
+ -- Looking for pthread.h
+ -- Looking for pthread.h - found
+ -- Looking for pthread_create
+ -- Looking for pthread_create - not found
+ -- Looking for pthread_create in pthreads
+ -- Looking for pthread_create in pthreads - not found
+ -- Looking for pthread_create in pthread
+ -- Looking for pthread_create in pthread - found
+ -- Found Threads: TRUE
+ -- Found torch: /torchbind_tutorial/libtorch/lib/libtorch.so
+ -- Configuring done
+ -- Generating done
+ -- Build files have been written to: /torchbind_tutorial/build
+ $ make -j
+ Scanning dependencies of target custom_class
+ [ 50%] Building CXX object CMakeFiles/custom_class.dir/class.cpp.o
+ [100%] Linking CXX shared library libcustom_class.so
+ [100%] Built target custom_class
+
+What you'll find is that there is now (among other things) a dynamic library
+file present in the build directory. On Linux, this is probably named
+``libcustom_class.so``. So the file tree should look like::
+
+ custom_class_project/
+ class.cpp
+ CMakeLists.txt
+ build/
+ libcustom_class.so
+
+Using the C++ Class from Python
+-----------------------------------------------
+
+Now that we have our class and its registration compiled into an ``.so`` file,
+we can load that ``.so`` into Python and try it out. Here's a script that
+demonstrates that:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/custom_test.py
+ :language: python
+
+
+Defining Serialization/Deserialization Methods for Custom C++ Classes
+---------------------------------------------------------------------
+
+If you try to save a ``ScriptModule`` with a custom-bound C++ class as
+an attribute, you'll get the following error:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/export_attr.py
+ :language: python
+
+.. code-block:: shell
+
+ $ python export_attr.py
+ RuntimeError: Cannot serialize custom bound C++ class __torch__.torch.classes.my_classes.MyStackClass. Please define serialization methods via def_pickle for this class. (pushIValueImpl at ../torch/csrc/jit/pickler.cpp:128)
+
+This is because PyTorch cannot automatically figure out what information to
+save from your C++ class. You must specify that manually. The way to do that
+is to define ``__getstate__`` and ``__setstate__`` methods on the class using
+the special ``def_pickle`` method on ``class_``.
+
+.. note::
+ The semantics of ``__getstate__`` and ``__setstate__`` are
+ equivalent to that of the Python pickle module. You can
+ `read more <https://docs.python.org/3/library/pickle.html#object.__getstate__>`_
+ about how we use these methods.
+
+Here is an example of the ``def_pickle`` call we can add to the registration of
+``MyStackClass`` to include serialization methods:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp
+ :language: cpp
+ :start-after: BEGIN def_pickle
+ :end-before: END def_pickle
+
+.. note::
+ We take a different approach from pybind11 in the pickle API. Whereas pybind11
+ has a special function ``pybind11::pickle()`` which you pass into ``class_::def()``,
+ we have a separate method ``def_pickle`` for this purpose. This is because the
+ name ``torch::jit::pickle`` was already taken, and we didn't want to cause confusion.
+
+Once we have defined the (de)serialization behavior in this way, our script can
+now run successfully:
+
+.. code-block:: shell
+
+ $ python ../export_attr.py
+ testing
+
+Defining Custom Operators that Take or Return Bound C++ Classes
+---------------------------------------------------------------
+
+Once you've defined a custom C++ class, you can also use that class
+as an argument to or return value from a custom operator (i.e. free functions). Suppose
+you have the following free function:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp
+ :language: cpp
+ :start-after: BEGIN free_function
+ :end-before: END free_function
+
+You can register it by running the following code inside your ``TORCH_LIBRARY``
+block:
+
+.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp
+ :language: cpp
+ :start-after: BEGIN def_free
+ :end-before: END def_free
+
+Once this is done, you can use the op as in the following example:
+
+.. code-block:: python
+
+ class TryCustomOp(torch.nn.Module):
+ def __init__(self):
+ super(TryCustomOp, self).__init__()
+ self.f = torch.classes.my_classes.MyStackClass(["foo", "bar"])
+
+ def forward(self):
+ return torch.ops.my_classes.manipulate_instance(self.f)
+
+.. note::
+
+ Registration of an operator that takes a C++ class as an argument requires that
+ the custom class has already been registered. You can enforce this by
+ making sure the custom class registration and your free function definitions
+ are in the same ``TORCH_LIBRARY`` block, and that the custom class
+ registration comes first. In the future, we may relax this requirement,
+ so that these can be registered in any order.
+
+
+Conclusion
+----------
+
+This tutorial walked you through how to expose a C++ class to PyTorch, how to
+register its methods, how to use that class from Python, and how to save and
+load code using the class and run that code in a standalone C++ process. You
+are now ready to extend your PyTorch models with C++ classes that interface
+with third party C++ libraries or implement any other use case that requires
+the lines between Python and C++ to blend smoothly.
+
+As always, if you run into any problems or have questions, you can use our
+`forum <https://discuss.pytorch.org>`_ or `GitHub issues
+<https://github.com/pytorch/pytorch/issues>`_ to get in touch. Also, our
+`frequently asked questions (FAQ) page
+<https://pytorch.org/cppdocs/notes/faq.html>`_ may have helpful information.
diff --git a/advanced_source/custom_classes/CMakeLists.txt b/advanced_source/custom_classes/CMakeLists.txt
new file mode 100644
index 00000000000..6a1eb3e87fa
--- /dev/null
+++ b/advanced_source/custom_classes/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(infer)
+
+find_package(Torch REQUIRED)
+
+add_subdirectory(custom_class_project)
+
+# Define our library target
+add_executable(infer infer.cpp)
+set(CMAKE_CXX_STANDARD 14)
+# Link against LibTorch
+target_link_libraries(infer "${TORCH_LIBRARIES}")
+# This is where we link in our libcustom_class code, making our
+# custom class available in our binary.
+target_link_libraries(infer -Wl,--no-as-needed custom_class)
diff --git a/advanced_source/custom_classes/custom_class_project/CMakeLists.txt b/advanced_source/custom_classes/custom_class_project/CMakeLists.txt
new file mode 100644
index 00000000000..bb3d41aa997
--- /dev/null
+++ b/advanced_source/custom_classes/custom_class_project/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(custom_class)
+
+find_package(Torch REQUIRED)
+
+# Define our library target
+add_library(custom_class SHARED class.cpp)
+set(CMAKE_CXX_STANDARD 14)
+# Link against LibTorch
+target_link_libraries(custom_class "${TORCH_LIBRARIES}")
diff --git a/advanced_source/custom_classes/custom_class_project/class.cpp b/advanced_source/custom_classes/custom_class_project/class.cpp
new file mode 100644
index 00000000000..dc89a3ecb2e
--- /dev/null
+++ b/advanced_source/custom_classes/custom_class_project/class.cpp
@@ -0,0 +1,132 @@
+// BEGIN class
+// This header is all you need to do the C++ portions of this
+// tutorial
+#include <torch/script.h>
+// This header is what defines the custom class registration
+// behavior specifically. script.h already includes this, but
+// we include it here so you know it exists in case you want
+// to look at the API or implementation.
+#include <torch/custom_class.h>
+
+#include <string>
+#include <vector>
+
+template <class T>
+struct MyStackClass : torch::CustomClassHolder {
+ std::vector<T> stack_;
+ MyStackClass(std::vector<T> init) : stack_(init.begin(), init.end()) {}
+
+ void push(T x) {
+ stack_.push_back(x);
+ }
+ T pop() {
+ auto val = stack_.back();
+ stack_.pop_back();
+ return val;
+ }
+
+ c10::intrusive_ptr<MyStackClass> clone() const {
+ return c10::make_intrusive<MyStackClass>(stack_);
+ }
+
+ void merge(const c10::intrusive_ptr<MyStackClass>& c) {
+ for (auto& elem : c->stack_) {
+ push(elem);
+ }
+ }
+};
+// END class
+
+// BEGIN free_function
+c10::intrusive_ptr<MyStackClass<std::string>> manipulate_instance(const c10::intrusive_ptr<MyStackClass<std::string>>& instance) {
+ instance->pop();
+ return instance;
+}
+// END free_function
+
+// BEGIN binding
+// Notice a few things:
+// - We pass the class to be registered as a template parameter to
+// `torch::class_`. In this instance, we've passed the
+// specialization of the MyStackClass class ``MyStackClass<std::string>``.
+// In general, you cannot register a non-specialized template
+// class. For non-templated classes, you can just pass the
+// class name directly as the template parameter.
+// - The arguments passed to the constructor make up the "qualified name"
+// of the class. In this case, the registered class will appear in
+// Python and C++ as `torch.classes.my_classes.MyStackClass`. We call
+// the first argument the "namespace" and the second argument the
+// actual class name.
+TORCH_LIBRARY(my_classes, m) {
+ m.class_<MyStackClass<std::string>>("MyStackClass")
+ // The following line registers the constructor of our MyStackClass
+ // class that takes a single `std::vector<std::string>` argument,
+ // i.e. it exposes the C++ method `MyStackClass(std::vector<std::string> init)`.
+ // Currently, we do not support registering overloaded
+ // constructors, so for now you can only `def()` one instance of
+ // `torch::init`.
+ .def(torch::init<std::vector<std::string>>())
+ // The next line registers a stateless (i.e. no captures) C++ lambda
+ // function as a method. Note that a lambda function must take a
+ // `c10::intrusive_ptr<YourClass>` (or some const/ref version of that)
+ // as the first argument. Other arguments can be whatever you want.
+ .def("top", [](const c10::intrusive_ptr>& self) {
+ return self->stack_.back();
+ })
+ // The following four lines expose methods of the MyStackClass
+ // class as-is. `torch::class_` will automatically examine the
+ // argument and return types of the passed-in method pointers and
+ // expose these to Python and TorchScript accordingly. Finally, notice
+ // that we must take the *address* of the fully-qualified method name,
+ // i.e. use the unary `&` operator, due to C++ typing rules.
+ .def("push", &MyStackClass::push)
+ .def("pop", &MyStackClass::pop)
+ .def("clone", &MyStackClass::clone)
+ .def("merge", &MyStackClass::merge)
+// END binding
+#ifndef NO_PICKLE
+// BEGIN def_pickle
+ // class_<>::def_pickle allows you to define the serialization
+ // and deserialization methods for your C++ class.
+ // Currently, we only support passing stateless lambda functions
+ // as arguments to def_pickle
+ .def_pickle(
+ // __getstate__
+ // This function defines what data structure should be produced
+ // when we serialize an instance of this class. The function
+ // must take a single `self` argument, which is an intrusive_ptr
+ // to the instance of the object. The function can return
+ // any type that is supported as a return value of the TorchScript
+ // custom operator API. In this instance, we've chosen to return
+ // a std::vector<std::string> as the salient data to preserve
+ // from the class.
+ [](const c10::intrusive_ptr<MyStackClass<std::string>>& self)
+ -> std::vector<std::string> {
+ return self->stack_;
+ },
+ // __setstate__
+ // This function defines how to create a new instance of the C++
+ // class when we are deserializing. The function must take a
+ // single argument of the same type as the return value of
+ // `__getstate__`. The function must return an intrusive_ptr
+ // to a new instance of the C++ class, initialized however
+ // you would like given the serialized state.
+ [](std::vector<std::string> state)
+ -> c10::intrusive_ptr<MyStackClass<std::string>> {
+ // A convenient way to instantiate an object and get an
+ // intrusive_ptr to it is via `make_intrusive`. We use
+ // that here to allocate an instance of MyStackClass<std::string>
+ // and call the single-argument std::vector<std::string>
+ // constructor with the serialized state.
+ return c10::make_intrusive<MyStackClass<std::string>>(std::move(state));
+ });
+// END def_pickle
+#endif // NO_PICKLE
+
+// BEGIN def_free
+ m.def(
+ "manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y",
+ manipulate_instance
+ );
+// END def_free
+}
diff --git a/advanced_source/custom_classes/custom_class_project/custom_test.py b/advanced_source/custom_classes/custom_class_project/custom_test.py
new file mode 100644
index 00000000000..1deda445310
--- /dev/null
+++ b/advanced_source/custom_classes/custom_class_project/custom_test.py
@@ -0,0 +1,53 @@
+import torch
+
+# `torch.classes.load_library()` allows you to pass the path to your .so file
+# to load it in and make the custom C++ classes available to both Python and
+# TorchScript
+torch.classes.load_library("build/libcustom_class.so")
+# You can query the loaded libraries like this:
+print(torch.classes.loaded_libraries)
+# prints {'/custom_class_project/build/libcustom_class.so'}
+
+# We can find and instantiate our custom C++ class in python by using the
+# `torch.classes` namespace:
+#
+# This instantiation will invoke the MyStackClass(std::vector<std::string> init)
+# constructor we registered earlier
+s = torch.classes.my_classes.MyStackClass(["foo", "bar"])
+
+# We can call methods in Python
+s.push("pushed")
+assert s.pop() == "pushed"
+
+# Test custom operator
+s.push("pushed")
+torch.ops.my_classes.manipulate_instance(s) # acting as s.pop()
+assert s.top() == "bar"
+
+# Returning and passing instances of custom classes works as you'd expect
+s2 = s.clone()
+s.merge(s2)
+for expected in ["bar", "foo", "bar", "foo"]:
+ assert s.pop() == expected
+
+# We can also use the class in TorchScript
+# For now, we need to assign the class's type to a local in order to
+# annotate the type on the TorchScript function. This may change
+# in the future.
+MyStackClass = torch.classes.my_classes.MyStackClass
+
+
+@torch.jit.script
+def do_stacks(s: MyStackClass): # We can pass a custom class instance
+ # We can instantiate the class
+ s2 = torch.classes.my_classes.MyStackClass(["hi", "mom"])
+ s2.merge(s) # We can call a method on the class
+ # We can also return instances of the class
+ # from TorchScript function/methods
+ return s2.clone(), s2.top()
+
+
+stack, top = do_stacks(torch.classes.my_classes.MyStackClass(["wow"]))
+assert top == "wow"
+for expected in ["wow", "mom", "hi"]:
+ assert stack.pop() == expected
diff --git a/advanced_source/custom_classes/custom_class_project/export_attr.py b/advanced_source/custom_classes/custom_class_project/export_attr.py
new file mode 100644
index 00000000000..9999d5c8183
--- /dev/null
+++ b/advanced_source/custom_classes/custom_class_project/export_attr.py
@@ -0,0 +1,21 @@
+# export_attr.py
+import torch
+
+torch.classes.load_library('build/libcustom_class.so')
+
+
+class Foo(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.stack = torch.classes.my_classes.MyStackClass(["just", "testing"])
+
+ def forward(self, s: str) -> str:
+ return self.stack.pop() + s
+
+
+scripted_foo = torch.jit.script(Foo())
+
+scripted_foo.save('foo.pt')
+loaded = torch.jit.load('foo.pt')
+
+print(loaded.stack.pop())
diff --git a/advanced_source/custom_classes/custom_class_project/save.py b/advanced_source/custom_classes/custom_class_project/save.py
new file mode 100644
index 00000000000..8826f95da7c
--- /dev/null
+++ b/advanced_source/custom_classes/custom_class_project/save.py
@@ -0,0 +1,18 @@
+import torch
+
+torch.classes.load_library('build/libcustom_class.so')
+
+
+class Foo(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, s: str) -> str:
+ stack = torch.classes.my_classes.MyStackClass(["hi", "mom"])
+ return stack.pop() + s
+
+
+scripted_foo = torch.jit.script(Foo())
+print(scripted_foo.graph)
+
+scripted_foo.save('foo.pt')
diff --git a/advanced_source/custom_classes/infer.cpp b/advanced_source/custom_classes/infer.cpp
new file mode 100644
index 00000000000..1ca5b002383
--- /dev/null
+++ b/advanced_source/custom_classes/infer.cpp
@@ -0,0 +1,20 @@
+#include <torch/script.h>
+
+#include <iostream>
+#include <memory>
+
+int main(int argc, const char* argv[]) {
+ torch::jit::Module module;
+ try {
+ // Deserialize the ScriptModule from a file using torch::jit::load().
+ module = torch::jit::load("foo.pt");
+ }
+ catch (const c10::Error& e) {
+ std::cerr << "error loading the model\n";
+ return -1;
+ }
+
+ std::vector<c10::IValue> inputs = {"foobarbaz"};
+ auto output = module.forward(inputs).toString();
+ std::cout << output->string() << std::endl;
+}
diff --git a/advanced_source/custom_classes/run.sh b/advanced_source/custom_classes/run.sh
new file mode 100755
index 00000000000..52c59581309
--- /dev/null
+++ b/advanced_source/custom_classes/run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+set -ex
+
+rm -rf build
+rm -rf custom_class_project/build
+
+pushd custom_class_project
+ mkdir build
+ (cd build && cmake CXXFLAGS="-DNO_PICKLE" -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..)
+ (cd build && make)
+ python custom_test.py
+ python save.py
+ ! python export_attr.py
+popd
+
+mkdir build
+(cd build && cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..)
+(cd build && make)
+mv custom_class_project/foo.pt build/foo.pt
+(cd build && ./infer)
diff --git a/advanced_source/custom_classes/run2.sh b/advanced_source/custom_classes/run2.sh
new file mode 100755
index 00000000000..d4ef0101a83
--- /dev/null
+++ b/advanced_source/custom_classes/run2.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -ex
+
+rm -rf build
+rm -rf custom_class_project/build
+
+pushd custom_class_project
+ mkdir build
+ (cd build && cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..)
+ (cd build && make)
+ python export_attr.py
+popd
diff --git a/advanced_source/custom_ops_landing_page.rst b/advanced_source/custom_ops_landing_page.rst
new file mode 100644
index 00000000000..f05eee43060
--- /dev/null
+++ b/advanced_source/custom_ops_landing_page.rst
@@ -0,0 +1,66 @@
+.. _custom-ops-landing-page:
+
+PyTorch Custom Operators
+===========================
+
+PyTorch offers a large library of operators that work on Tensors (e.g. ``torch.add``,
+``torch.sum``, etc). However, you may wish to bring a new custom operation to PyTorch
+and get it to work with subsystems like ``torch.compile``, autograd, and ``torch.vmap``.
+In order to do so, you must register the custom operation with PyTorch via the Python
+`torch.library docs <https://pytorch.org/docs/stable/library.html>`_ or C++ ``TORCH_LIBRARY``
+APIs.
+
+
+
+Authoring a custom operator from Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please see :ref:`python-custom-ops-tutorial`.
+
+You may wish to author a custom operator from Python (as opposed to C++) if:
+
+- you have a Python function you want PyTorch to treat as an opaque callable, especially with
+ respect to ``torch.compile`` and ``torch.export``.
+- you have some Python bindings to C++/CUDA kernels and want those to compose with PyTorch
+ subsystems (like ``torch.compile`` or ``torch.autograd``)
+- you are using Python (and not a C++-only environment like AOTInductor).
+
+Integrating custom C++ and/or CUDA code with PyTorch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please see :ref:`cpp-custom-ops-tutorial`.
+
+.. note::
+
+ ``SYCL`` serves as the backend programming language for Intel GPUs. To integrate custom SYCL code, refer to :ref:`cpp-custom-ops-tutorial-sycl`.
+
+You may wish to author a custom operator from C++ (as opposed to Python) if:
+
+- you have custom C++ and/or CUDA code.
+- you plan to use this code with ``AOTInductor`` to do Python-less inference.
+
+The Custom Operators Manual
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For information not covered in the tutorials and this page, please see
+The Custom Operators Manual
+(we're working on moving the information to our docs site). We recommend that you
+first read one of the tutorials above and then use the Custom Operators Manual as a reference;
+it is not meant to be read head to toe.
+
+When should I create a Custom Operator?
+---------------------------------------
+If your operation is expressible as a composition of built-in PyTorch operators,
+then please write it as a Python function and call it instead of creating a
+custom operator. Use the operator registration APIs to create a custom operator if you
+are calling into some library that PyTorch doesn't understand (e.g. custom C/C++ code,
+a custom CUDA kernel, or Python bindings to C/C++/CUDA extensions).
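+
+For example, a sketch of the "just write a Python function" case (the function
+name here is hypothetical):
+
+.. code-block:: python
+
+ import torch
+
+ # Expressible as built-in PyTorch operators: no custom operator needed,
+ # and it already composes with autograd and torch.compile.
+ def fused_scale_shift(x: torch.Tensor, scale: float, shift: float) -> torch.Tensor:
+ return x * scale + shift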
+
+Why should I create a Custom Operator?
+--------------------------------------
+
+It is possible to use a C/C++/CUDA kernel by grabbing a Tensor's data pointer
+and passing it to a pybind'ed kernel. However, this approach doesn't compose with
+PyTorch subsystems like autograd, torch.compile, vmap, and more. In order
+for an operation to compose with PyTorch subsystems, it must be registered
+via the operator registration APIs.
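+
+As a sketch of the Python registration path (the operator name is hypothetical;
+the kernel assumes CPU tensors):
+
+.. code-block:: python
+
+ import numpy as np
+ import torch
+
+ # Wrap an opaque NumPy kernel as a PyTorch operator so it composes with
+ # subsystems like torch.compile and torch.export.
+ @torch.library.custom_op("mylib::numpy_sin", mutates_args=())
+ def numpy_sin(x: torch.Tensor) -> torch.Tensor:
+ return torch.from_numpy(np.sin(x.numpy()))
+
+ # A fake (meta) implementation so the op can be traced without running it.
+ @numpy_sin.register_fake
+ def _(x):
+ return torch.empty_like(x)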
diff --git a/advanced_source/ddp_pipeline.rst b/advanced_source/ddp_pipeline.rst
new file mode 100644
index 00000000000..bf9e4d28f33
--- /dev/null
+++ b/advanced_source/ddp_pipeline.rst
@@ -0,0 +1,10 @@
+Training Transformer models using Distributed Data Parallel and Pipeline Parallelism
+====================================================================================
+
+This tutorial has been deprecated.
+
+Redirecting to the latest parallelism APIs in 3 seconds...
+
+.. raw:: html
+
+
diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst
new file mode 100644
index 00000000000..4b03803c15b
--- /dev/null
+++ b/advanced_source/dispatcher.rst
@@ -0,0 +1,410 @@
+Registering a Dispatched Operator in C++
+========================================
+
+.. warning::
+
+ This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page`
+ for the newest up-to-date guides on extending PyTorch with Custom Operators.
+
+The dispatcher is an internal component of PyTorch which is responsible for
+figuring out what code should actually get run when you call a function like
+``torch::add``. This can be nontrivial, because PyTorch operations need
+to handle a lot of cross-cutting concerns that are "layered" on top of one
+another. Here is a sampling of some of the things it handles:
+
+* Switching between the CPU and CUDA implementations of an operator, depending
+ on the devices of the input tensors.
+* Switching between the autograd and backend implementations of an operator,
+ depending on whether or not autograd handling is necessary.
+* Applying autocasting when necessary for automatic mixed precision.
+* Applying batching rules when an operator is run under a ``vmap`` call.
+* Tracing execution of operations, if you are tracing a model for export.
+
+If in your custom operator code you find yourself
+manually writing if statements to handle these cases, the dispatcher APIs can
+help organize your code. (Conversely, if your custom operator is very simple
+and is only for CPU inference, you probably don't need to use the dispatcher,
+just use the basic API.)
+
+In this tutorial, we will describe how to structure a custom operator
+registration to use the dispatcher to organize various components. We'll
+assume that you are familiar with how to
+`register an operator <torch_script_custom_ops.html>`_ and how to write
+a `custom autograd function <cpp_autograd.html>`_.
+
+Defining schema and backend implementations
+-------------------------------------------
+
+The general principle behind the dispatcher is that it divides the
+implementation of an operator into multiple kernels, each of which implements
+functionality for a specific *dispatch key*, e.g. CPU, CUDA. The dispatcher
+determines what the highest priority dispatch key is at the time
+you call an operator (this is done by looking at both the tensor arguments as
+well as some thread local state), and transfers control to the kernel for that
+dispatch key. The end effect is that when you call an operator, we first
+execute the Autograd kernel, and then we redispatch to the backend kernel
+depending on the device types of the passed in tensors.
+
+Let's take a look at the various parts involved in making this
+happen. First, we must define the schema for the operator in question.
+Unlike simple pybind11-style operator registration, we don't actually
+provide an implementation of our operator at this point; we just
+provide a schema string specifying the type signature of the operator
+that all of our other kernels will abide by:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN TORCH_LIBRARY
+ :end-before: END TORCH_LIBRARY
+
+Next, we need to actually provide some implementations of this operator.
+For concreteness, here is a really simple implementation of addition on CPU:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN myadd_cpu
+ :end-before: END myadd_cpu
+
+We'd like to register this function as an implementation of ``myops::myadd``.
+However, the simple way of registering it (``def("myadd", myadd_cpu)``) would
+register the kernel to run in all cases, even if the tensor is not a CPU
+tensor! (Internally, we refer to these as "catch-all" kernels, since they
+catch all cases.) To ensure that ``myadd_cpu`` is only run for
+CPU tensors, we can use the ``TORCH_LIBRARY_IMPL`` macro:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN TORCH_LIBRARY_IMPL CPU
+ :end-before: END TORCH_LIBRARY_IMPL CPU
+
+The ``TORCH_LIBRARY_IMPL`` lets us register implementations for operators on
+a specific dispatch key (in this case, CPU). Each call to ``impl``
+associates a CPU kernel with the corresponding operator (which we previously
+defined in the ``TORCH_LIBRARY`` block). If we also have a CUDA implementation ``myadd_cuda``,
+we can register it in a separate ``TORCH_LIBRARY_IMPL`` block:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN TORCH_LIBRARY_IMPL CUDA
+ :end-before: END TORCH_LIBRARY_IMPL CUDA
+
+These registrations can be split across files or even across library boundaries; so
+for example, you could have these two ``TORCH_LIBRARY_IMPL`` blocks compiled
+into separate ``myops_cpu`` and ``myops_cuda`` dynamic libraries. Generally
+speaking, the structure of your registrations will look like this:
+
+1. A single ``TORCH_LIBRARY`` that lists every custom operator in your namespace
+ in a centralized place.
+2. A ``TORCH_LIBRARY_IMPL`` per dispatch key that registers implementations for
+ that key (e.g., CPU or CUDA). If you like, you can further subdivide
+ ``TORCH_LIBRARY_IMPL`` blocks into a block per operator. This is convenient
+ if you have a separate file per operator implementation, but don't want to
+ expose the operators in a header; you can just put the registration in the
+ cpp file that defines your operator.
+
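+As a sketch of that structure, reusing the ``myops::myadd`` example from this
+tutorial (``mysub`` and its kernels are hypothetical):
+
+.. code-block:: cpp
+
+ // One centralized TORCH_LIBRARY block lists every operator schema...
+ TORCH_LIBRARY(myops, m) {
+ m.def("myadd(Tensor self, Tensor other) -> Tensor");
+ m.def("mysub(Tensor self, Tensor other) -> Tensor"); // hypothetical
+ }
+
+ // ...and one TORCH_LIBRARY_IMPL block per dispatch key.
+ TORCH_LIBRARY_IMPL(myops, CPU, m) {
+ m.impl("myadd", myadd_cpu);
+ m.impl("mysub", mysub_cpu); // hypothetical
+ }
+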
+.. note::
+
+ Did you know that you can also write ``TORCH_LIBRARY_IMPL`` blocks for existing
+ core operators in PyTorch? This is how XLA support for PyTorch is
+ implemented: the ``torch_xla`` library contains a ``TORCH_LIBRARY_IMPL``
+ that provides implementations for all basic operators on the XLA dispatch
+ key.
+
+
+For operators that do not need autograd
+---------------------------------------
+
+Note: This section only applies to versions of PyTorch ``>= 1.10``.
+
+In the next section, we will discuss how to add autograd support to an operator.
+But for ops that do not need autograd support, the following kernel should be
+registered to improve usability and make your op behave like PyTorch's built-in
+operators.
+
+.. code-block:: cpp
+
+ TORCH_LIBRARY_IMPL(myops, Autograd, m) {
+ m.impl(op, autogradNotImplementedFallback());
+ }
+
+The above lines register an ``Autograd`` kernel that appends a dummy
+``NotImplemented`` node on forward (preserving the ``requires_grad``-ness of the inputs).
+On backward, the ``NotImplemented`` node raises an error. This can be helpful
+for debugging in larger models, where previously it could be hard to pin-point
+exactly where the ``requires_grad``-ness was lost during the forward pass.
+
+In-place or view ops
+^^^^^^^^^^^^^^^^^^^^
+
+To ensure correctness and best possible performance, if your op mutates an input
+in-place or returns a tensor that aliases with one of the inputs, two additional
+steps should be taken:
+
+1. Register an ``ADInplaceOrView`` kernel in addition to the ``Autograd`` kernel
+ above. This kernel handles the necessary bookkeeping to ensure the correctness
+ of in-place or view operations. It is important to note that this ADInplaceOrView
+ kernel should only be used with ``autogradNotImplementedFallback``.
+
+.. code-block:: cpp
+
+ TORCH_LIBRARY_IMPL(myops, Autograd, m) {
+ m.impl(op, autogradNotImplementedFallback());
+ }
+ TORCH_LIBRARY_IMPL(myops, ADInplaceOrView, m) {
+ m.impl(op, autogradNotImplementedInplaceOrViewFallback());
+ }
+
+2. The ``Autograd`` or ``ADInplaceOrView`` boxed kernels registered above
+ rely on operator schema information in their logic. If your op mutates an input
+ in-place or returns a tensor that aliases one of the inputs, it is important to
+ ensure that your schema properly reflects this. See
+ `here <https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/README.md#annotations>`_
+ for more information on how to annotate the schema.
+
+.. _autograd-support:
+
+Adding autograd support
+-----------------------
+
+At this point, we have an operator with both CPU and CUDA implementations. How
+can we add autograd support to it? As you might guess, we will register an
+autograd kernel (similar to what's described in the `custom autograd function <cpp_autograd.html>`_ tutorial)!
+However, there is a twist: unlike the CPU and CUDA kernels, the autograd kernel
+needs to *redispatch*: it needs to call back into the dispatcher to get to
+the inference kernels, e.g. CPU or CUDA implementations.
+
+Thus, before we write the autograd kernel, let's write a *dispatching function*
+which calls into the dispatcher to find the right kernel for your operator.
+This function constitutes the public C++ API for your operators--in fact, all of
+the tensor functions in PyTorch's C++ API call the dispatcher in the same
+way under the hood. Here's what the dispatching function looks like:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN myadd
+ :end-before: END myadd
+
+Let's break it down:
+
+* In the first line, we look up a typed operator handle from the dispatcher
+ corresponding to the operator that we are going to dispatch to.
+ ``findSchemaOrThrow`` takes two arguments: the (namespace qualified) name
+ of the operator, and the overload name of the operator (typically just
+ the empty string). ``typed`` casts the dynamically typed handle into
+ a statically typed handle (doing a runtime test to make sure you've given
+ the correct C++ type), so that we can do a normal C++ call on it. We
+ pass it ``decltype(myadd)`` since the type of the dispatching function is
+ the same as the type of the underlying kernels registered to the dispatcher.
+
+ For performance, this computation is done in a static variable, so that
+ we only need to do the (slow) lookup once. If you typoed the name of the
+ operator you want to call, this lookup will error the first time you call this
+ function.
+
+* In the second line, we simply ``call`` the operator handle with all of the
+ arguments passed into the dispatching function. This will actually invoke
+ the dispatcher and in the end control will be transferred to whatever kernel
+ is appropriate for this call.
+
+With the dispatch function in hand, we can now write the autograd kernel:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN myadd_autograd
+ :end-before: END myadd_autograd
+
+The autograd function is written as normal using ``torch::autograd::Function``,
+except that instead of directly writing the implementation in ``forward()``,
+we:
+
+1. Turn off autograd handling with the ``at::AutoNonVariableTypeMode`` RAII
+ guard, and then
+2. Call the dispatch function ``myadd`` to call back into the dispatcher.
+
+Without (1), your calls will loop infinitely (and overflow the stack), because
+``myadd`` will send you back to this function (as the highest priority dispatch
+key would still be autograd). With (1),
+autograd is excluded from the set of dispatch keys under consideration, and
+we will go to the next handlers, which will be either CPU or CUDA.
+
+We can now register this function in the same way we registered the CPU/CUDA
+functions:
+
+.. literalinclude:: ../advanced_source/dispatcher/op.cpp
+ :language: cpp
+ :start-after: BEGIN TORCH_LIBRARY_IMPL Autograd
+ :end-before: END TORCH_LIBRARY_IMPL Autograd
+
+
+.. note::
+
+ In this example we register the kernel to ``Autograd``, which installs it as the
+ autograd kernel for all backends. You can also register optimized kernels for specific
+ backends by using the corresponding backend-specific dispatch key - for example,
+ ``AutogradCPU`` or ``AutogradCUDA``. To explore these and other dispatch key
+ options in more detail, check out the ``PythonDispatcher`` tool provided in
+ `torch/_python_dispatcher.py <https://github.com/pytorch/pytorch/blob/main/torch/_python_dispatcher.py>`_.
+
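+A short sketch of experimenting with that tool (treat the exact API as subject
+to change):
+
+.. code-block:: python
+
+ from torch._python_dispatcher import PythonDispatcher
+
+ d = PythonDispatcher()
+ # Register kernels for a toy operator under several dispatch keys...
+ d.register(["CPU", "XLA", "AutogradCPU"])
+ # ...then print the dispatch table the dispatcher computes from them.
+ print(d.dispatchTable())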
+
+Going beyond autograd
+---------------------
+
+In some sense, the dispatcher isn't doing all that much: all it does is
+implement a glorified if-statement, along the lines of this:
+
+.. code-block:: cpp
+
+ class MyAddFunction : ... {
+ public:
+ static Tensor forward(
+ AutogradContext *ctx, torch::Tensor self, torch::Tensor other) {
+
+ if (self.device().type() == DeviceType::CPU) {
+ return add_cpu(self, other);
+ } else if (self.device().type() == DeviceType::CUDA) {
+ return add_cuda(self, other);
+ } else {
+ TORCH_CHECK(0, "Unsupported device ", self.device().type());
+ }
+ }
+ ...
+ }
+
+So why use the dispatcher? There are a few reasons:
+
+1. It is decentralized. You can assemble all of the pieces of an operator
+ (CPU, CUDA, Autograd) without having to write a single, centralized
+ if statement that refers to all of them. Importantly, third parties can
+ register extra implementations for other aspects without having to patch the
+ original definition of an operator. We'll talk more about extending the
+ dispatcher in `extending dispatcher for a new backend <extend_dispatcher.html>`_.
+
+2. It supports more dispatch keys than CPU, CUDA and Autograd. You can
+ see a full list of dispatch keys that are currently implemented
+ in PyTorch in ``c10/core/DispatchKey.h``. These dispatch keys
+ implement a variety of optional functionality for operators, and if you
+ decide you want your custom operator to support this functionality,
+ all you have to do is register a kernel for the appropriate key.
+
+3. The dispatcher implements support for boxed fallback functions, which
+ are functions that can be implemented once and apply to all operators
+ in the system. Boxed fallbacks can be used to provide default behavior
+ for a dispatch key; if you use the dispatcher to implement your operator,
+ you also opt into the fallbacks for all of these operations.
+
+Here are some particular dispatch keys which you may need to define an operator
+for.
+
+Autocast
+^^^^^^^^
+
+The Autocast dispatch key implements support for
+`automatic mixed precision (AMP) <https://pytorch.org/docs/stable/amp.html>`_.
+An autocast wrapper kernel typically casts incoming ``float16`` or ``float32`` CUDA tensors
+to some preferred precision before running the op.
+For example, matmuls and convolutions on floating-point CUDA tensors usually run faster
+and use less memory in ``float16`` without impairing convergence.
+Autocast wrappers only have an effect in
+`autocast-enabled contexts <https://pytorch.org/docs/stable/amp.html#torch.autocast>`_.
+
+Here's an autocast wrapper for a hypothetical custom matmul, along with its registration:
+
+.. code-block:: cpp
+
+ // Autocast-specific helper functions
+ #include <ATen/autocast_mode.h>
+
+ Tensor mymatmul_autocast(const Tensor& self, const Tensor& other) {
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+ return mymatmul(at::autocast::cached_cast(at::kHalf, self),
+ at::autocast::cached_cast(at::kHalf, other));
+ }
+
+ TORCH_LIBRARY_IMPL(myops, Autocast, m) {
+ m.impl("mymatmul", mymatmul_autocast);
+ }
+
+``cached_cast(kHalf, tensor)`` casts ``tensor`` to ``float16`` if ``tensor`` is CUDA and ``float32``;
+otherwise, it leaves ``tensor`` unchanged (c.f. the
+`eligibility policy <https://pytorch.org/docs/stable/amp.html#op-eligibility>`_ for natively autocasted ops).
+This ensures if the network calls ``mymatmul`` on any mixture of ``float16`` and ``float32`` CUDA tensors,
+``mymatmul`` runs in ``float16``. Meanwhile, calls to ``mymatmul`` with non-CUDA, integer-type, or ``float64``
+inputs are unaffected. Using ``cached_cast`` to follow the native eligibility policy in your own autocast wrapper
+is recommended, but not required. For example, if you wanted to force ``float16`` execution for all input types,
+you could ``return mymatmul(self.half(), other.half());`` instead of using ``cached_cast``.
+
+Notice that, like our autograd kernels, we exclude the ``Autocast`` key from
+dispatch before redispatching.
+
+By default, if no autocast wrapper is provided,
+we fall through directly to the regular operator implementation (no
+autocasting occurs). (We didn't use ``myadd`` for this example, since pointwise
+addition doesn't need autocasting and should just fall through.)
+
+When should an autocast wrapper be registered? Unfortunately, there aren't
+cut-and-dried rules for an op's preferred precision. You can
+get a sense for some native ops' preferred precisions by looking at the
+`cast lists <https://pytorch.org/docs/stable/amp.html#op-specific-behavior>`_.
+General guidance:
+
+* Ops that do reductions should probably execute in ``float32``,
+* Any op that does a convolution or gemm under the hood should
+ probably execute in ``float16``, and
+* Other ops with multiple floating-point tensor inputs should standardize
+ them to a common precision (unless the implementation supports inputs with different precisions).
+
+If your custom op falls into the third category, the ``promote_type`` template
+helps figure out the widest floating-point type present among input tensors, which is
+the safest choice for the execution type:
+
+.. code-block:: cpp
+
+ #include <ATen/autocast_mode.h>
+
+ Tensor my_multiple_input_op_autocast(const Tensor& t0, const Tensor& t1) {
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+ // The required at::kHalf argument is an optimistic initial guess.
+ auto exec_type = at::autocast::promote_type(at::kHalf, t0, t1);
+ return my_multiple_input_op(at::autocast::cached_cast(exec_type, t0),
+ at::autocast::cached_cast(exec_type, t1));
+ }
+
+If your custom op is :ref:`autograd-enabled <autograd-support>`, you only need to write and register
+an autocast wrapper for the same name onto which the autograd wrapper is registered.
+For example, if you wanted an autocast wrapper for the ``myadd`` function shown
+in the autograd section, all you'd need is
+
+.. code-block:: cpp
+
+ Tensor myadd_autocast(const Tensor& self, const Tensor& other) {
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+ return myadd(at::autocast::cached_cast(<desired dtype>, self),
+ at::autocast::cached_cast(<desired dtype>, other));
+ }
+
+ TORCH_LIBRARY_IMPL(myops, Autocast, m) {
+ m.impl("myadd", myadd_autocast);
+ }
+
+There are no separate gymnastics to make the backward method autocast compatible.
+However, the backward method defined in your custom autograd function will run in the same
+dtype as autocast sets for the forward method, so you should choose a ``<desired dtype>``
+suitable for both your forward and backward methods.
+
+Batched
+^^^^^^^
+
+Batched tensors allow you to write your code in a per-example manner, and then
+have them be automatically batched when run under a ``vmap`` invocation. The
+API for writing batching rules is currently under development, but once it is
+stabilized, you can add support for ``vmap`` for your operators by registering
+a kernel at the Batched dispatch key.
+
+Tracer
+^^^^^^
+
+The Tracer dispatch key implements support for recording invocations of operators
+into a trace when you run ``torch.jit.trace``. We intend to provide a
+boxed fallback that will implement tracing for arbitrary operations,
+see `issue #41478 <https://github.com/pytorch/pytorch/issues/41478>`_ to track
+progress.
diff --git a/advanced_source/dispatcher/CMakeLists.txt b/advanced_source/dispatcher/CMakeLists.txt
new file mode 100644
index 00000000000..0ef448a9644
--- /dev/null
+++ b/advanced_source/dispatcher/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(dispatcher)
+
+find_package(Torch REQUIRED)
+
+add_library(dispatcher SHARED op.cpp)
+target_compile_features(dispatcher PRIVATE cxx_std_14)
+target_link_libraries(dispatcher "${TORCH_LIBRARIES}")
diff --git a/advanced_source/dispatcher/op.cpp b/advanced_source/dispatcher/op.cpp
new file mode 100644
index 00000000000..c3a90aed448
--- /dev/null
+++ b/advanced_source/dispatcher/op.cpp
@@ -0,0 +1,105 @@
+#include <torch/torch.h>
+#include <torch/script.h>
+
+#include <iostream>
+
+using torch::Tensor;
+using torch::DeviceType;
+using torch::autograd::tensor_list;
+using torch::autograd::AutogradContext;
+
+// BEGIN myadd
+Tensor myadd(const Tensor& self, const Tensor& other) {
+ static auto op = torch::Dispatcher::singleton()
+ .findSchemaOrThrow("myops::myadd", "")
+ .typed<decltype(myadd)>();
+ return op.call(self, other);
+}
+// END myadd
+
+// BEGIN TORCH_LIBRARY
+TORCH_LIBRARY(myops, m) {
+ m.def("myadd(Tensor self, Tensor other) -> Tensor");
+}
+// END TORCH_LIBRARY
+
+// BEGIN myadd_cpu
+Tensor myadd_cpu(const Tensor& self_, const Tensor& other_) {
+ TORCH_CHECK(self_.sizes() == other_.sizes());
+ TORCH_INTERNAL_ASSERT(self_.device().type() == DeviceType::CPU);
+ TORCH_INTERNAL_ASSERT(other_.device().type() == DeviceType::CPU);
+ Tensor self = self_.contiguous();
+ Tensor other = other_.contiguous();
+ Tensor result = torch::empty(self.sizes(), self.options());
+ const float* self_ptr = self.data_ptr<float>();
+ const float* other_ptr = other.data_ptr<float>();
+ float* result_ptr = result.data_ptr<float>();
+ for (int64_t i = 0; i < result.numel(); i++) {
+ result_ptr[i] = self_ptr[i] + other_ptr[i];
+ }
+ return result;
+}
+// END myadd_cpu
+
+// BEGIN TORCH_LIBRARY_IMPL CPU
+TORCH_LIBRARY_IMPL(myops, CPU, m) {
+ m.impl("myadd", myadd_cpu);
+}
+// END TORCH_LIBRARY_IMPL CPU
+
+Tensor myadd_cuda(const Tensor& self, const Tensor& other) {
+ // Insert your CUDA implementation here
+ TORCH_CHECK(0, "CUDA not yet implemented");
+}
+
+// BEGIN TORCH_LIBRARY_IMPL CUDA
+TORCH_LIBRARY_IMPL(myops, CUDA, m) {
+ m.impl("myadd", myadd_cuda);
+}
+// END TORCH_LIBRARY_IMPL CUDA
+
+// BEGIN myadd_autograd
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+ public:
+ static Tensor forward(
+ AutogradContext *ctx, torch::Tensor self, torch::Tensor other) {
+ at::AutoNonVariableTypeMode g;
+ return myadd(self, other);
+ }
+
+ static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+ auto grad_output = grad_outputs[0];
+ return {grad_output, grad_output};
+ }
+};
+
+Tensor myadd_autograd(const Tensor& self, const Tensor& other) {
+ return MyAddFunction::apply(self, other)[0];
+}
+// END myadd_autograd
+
+// BEGIN TORCH_LIBRARY_IMPL Autograd
+TORCH_LIBRARY_IMPL(myops, Autograd, m) {
+ m.impl("myadd", myadd_autograd);
+}
+// END TORCH_LIBRARY_IMPL Autograd
+
+#if 0
+// BEGIN TORCH_LIBRARY_IMPL Named
+Tensor myadd_named(const Tensor& self, const Tensor& other) {
+ // TODO: shouldn't need to do size check here
+ TORCH_CHECK(self.sizes() == other.sizes());
+ auto maybe_outnames = at::unify_from_right(self.names(), other.names());
+ auto result = ([&]() {
+ at::NoNamesGuard guard;
+ return myadd(self, other);
+ })();
+ at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
+ return result;
+}
+
+TORCH_LIBRARY_IMPL(myops, Named, m) {
+ m.impl("myadd", myadd_named);
+}
+// END TORCH_LIBRARY_IMPL Named
+#endif
diff --git a/advanced_source/dispatcher/test.py b/advanced_source/dispatcher/test.py
new file mode 100644
index 00000000000..cd35b05a47a
--- /dev/null
+++ b/advanced_source/dispatcher/test.py
@@ -0,0 +1,11 @@
+import torch
+
+torch.ops.load_library("build/libdispatcher.so")
+print(torch.ops.myops.myadd(torch.randn(32, 32), torch.rand(32, 32)))
+"""
+# Doesn't currently work, because the Python frontend on torch.ops doesn't
+# support names (for no good reason?)
+x = torch.randn(32, 32, names=('A', 'B'))
+y = torch.rand(32, 32, names=('A', 'B'))
+print(torch.ops.myops.myadd(x, y))
+"""
diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst
new file mode 100644
index 00000000000..12f15355f5f
--- /dev/null
+++ b/advanced_source/extend_dispatcher.rst
@@ -0,0 +1,380 @@
+Extending dispatcher for a new backend in C++
+=============================================
+
+In this tutorial we will walk through all the necessary steps to extend the dispatcher to
+add a new device living outside the ``pytorch/pytorch`` repo and to keep it in
+sync with native PyTorch devices. Here we'll assume that you're familiar with how
+to `register a dispatched operator in C++ <dispatcher.html>`_ and how to write a
+`custom autograd function <cpp_autograd.html>`_.
+
+
+.. note::
+
+   This tutorial touches many internal components inside PyTorch that are being actively improved;
+   expect API changes if you decide to follow this tutorial. We'll keep this tutorial
+   up to date with the latest APIs.
+
+What's a new backend?
+---------------------
+
+Adding a new backend to PyTorch requires a lot of development and maintenance from backend extenders.
+Before adding a new backend, let's first consider a few common use cases and recommended solutions for them:
+
+* If you have new algorithms for an existing PyTorch operator, send a PR to PyTorch.
+* If you want to propose a new operator, send a feature request/PR to PyTorch.
+* If you want to add support for a new device/hardware like Google TPU and customized chips, which often requires using
+  hardware-specific APIs to write kernels, follow this tutorial and add an out-of-tree backend to PyTorch.
+* If you want to add support for existing operators but with a different Tensor layout/representation
+  like sparse and quantized, which forces your kernels to be written in a way that's more efficient
+  given the layout/representation limitation, follow this tutorial and add an out-of-tree backend to PyTorch.
+
+In this tutorial we'll mainly focus on adding a new out-of-tree device. Adding out-of-tree support
+for a different tensor layout might share many common steps with devices, but we haven't seen an example of
+such an integration yet, so it might require additional work from PyTorch to support it.
+
+Get a dispatch key for your backend
+-----------------------------------
+
+PyTorch operators are implemented in C++ and made available in the Python frontend through Python bindings.
+The PyTorch dispatcher divides the implementation of an operator into multiple kernels, each of which is
+associated with a specific dispatch key. Supporting a new backend in PyTorch essentially means writing
+a kernel for each PyTorch operator in C++ and then registering them to a dispatch key representing your
+customized backend in the dispatcher.
+
+A dispatch key is your identifier in the dispatcher system. The dispatcher looks at the dispatch keys carried on
+input tensors and calls the right kernel accordingly. PyTorch provides three reserved dispatch keys
+(and their corresponding Autograd keys) for prototyping out-of-tree backend extensions:
+
+* PrivateUse1/AutogradPrivateUse1
+* PrivateUse2/AutogradPrivateUse2
+* PrivateUse3/AutogradPrivateUse3
+
+You can choose any of the keys above to prototype your customized backend.
+To create a Tensor on the ``PrivateUse1`` backend, you need to set the dispatch key in the ``TensorImpl`` constructor.
+
+.. code-block:: cpp
+
+  /* Example TensorImpl constructor */
+  TensorImpl(
+      Storage&& storage,
+      DispatchKeySet ks,
+      const caffe2::TypeMeta data_type);
+
+  // To create a TensorImpl on PrivateUse1 backend, pass in the following ks to TensorImpl creation.
+  DispatchKeySet ks = c10::DispatchKeySet{c10::DispatchKey::PrivateUse1, c10::DispatchKey::AutogradPrivateUse1};
+
+
+Note that the ``TensorImpl`` class above assumes your Tensor is backed by a storage like CPU/CUDA tensors are. We also
+provide ``OpaqueTensorImpl`` for backends without a storage, and you might need to tweak/override certain
+methods to fit your customized hardware.
+One example in the pytorch repo is the `Vulkan TensorImpl `_.
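+
+Putting the above together, here is a minimal, hypothetical sketch of allocating a contiguous
+float Tensor that carries the ``PrivateUse1`` dispatch keys. The helper name and the
+``my_allocator`` parameter (a backend-provided ``c10::Allocator*``) are assumptions of this
+sketch, not PyTorch APIs:
+
+.. code-block:: cpp
+
+  // Hypothetical helper (not a PyTorch API): build a contiguous float Tensor
+  // whose TensorImpl carries the PrivateUse1 dispatch keys.
+  at::Tensor make_privateuse1_tensor(at::IntArrayRef sizes, c10::Allocator* my_allocator) {
+    int64_t numel = 1;
+    for (auto s : sizes) numel *= s;
+    c10::Storage storage(
+        c10::Storage::use_byte_size_t(),
+        numel * sizeof(float),
+        my_allocator,
+        /*resizable=*/false);
+    c10::DispatchKeySet ks{
+        c10::DispatchKey::PrivateUse1, c10::DispatchKey::AutogradPrivateUse1};
+    // at::detail::make_tensor wraps the TensorImpl constructor shown above.
+    auto tensor = at::detail::make_tensor<c10::TensorImpl>(
+        std::move(storage), ks, caffe2::TypeMeta::Make<float>());
+    tensor.unsafeGetTensorImpl()->set_sizes_contiguous(sizes);
+    return tensor;
+  }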
+
+
+.. note::
+ Once the prototype is done and you plan to do regular releases for your backend extension, please feel free to
+ submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispatch key for your backend.
+
+
+Get the full list of PyTorch operators
+--------------------------------------
+
+PyTorch provides a full list of extensible C++ operators in the generated file
+``build/aten/src/ATen/RegistrationDeclarations.h``.
+This file is only available after building PyTorch from source.
+Here's a snippet of the file:
+
+.. code-block:: cpp
+
+  Tensor abs(const Tensor & self); // {"schema": "aten::abs(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+  Tensor & abs_(Tensor & self); // {"schema": "aten::abs_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+  Tensor & abs_out(Tensor & out, const Tensor & self); // {"schema": "aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+  Tensor absolute(const Tensor & self); // {"schema": "aten::absolute(Tensor self) -> Tensor", "dispatch": "False", "default": "False"}
+  Tensor & absolute_(Tensor & self); // {"schema": "aten::absolute_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "False"}
+  Tensor & absolute_out(Tensor & out, const Tensor & self); // {"schema": "aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "False"}
+  Tensor angle(const Tensor & self); // {"schema": "aten::angle(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+  Tensor & angle_out(Tensor & out, const Tensor & self); // {"schema": "aten::angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+  Tensor sgn(const Tensor & self); // {"schema": "aten::sgn(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+
+There are multiple fields associated with a single operator. Let's break them down using ``abs_out`` as an example:
+
+* ``Tensor & abs_out(Tensor & out, const Tensor & self);`` is the C++ signature of the operator; your C++
+  kernel should match this signature exactly.
+* ``aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)`` is the unique schema representing the operator,
+  which also contains aliasing and mutation annotations compared to the C++ signature. This is the unique identifier
+  the dispatcher uses to find an operator.
+* ``dispatch`` and ``default`` are boolean fields that provide information about what native PyTorch kernels
+  can do, and thus imply whether the kernel is required from backend extenders.
+  More details can be found in :ref:`register kernels for the new backend <register-kernel>`.
+
+
+.. _register-kernel:
+
+Register kernels for the new backend
+------------------------------------
+
+To register your kernels to the PyTorch dispatcher, you can use the
+``TORCH_LIBRARY_IMPL`` API described in
+`Registering a Dispatched Operator in C++ <dispatcher.html>`_:
+
+.. code-block:: cpp
+
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op1>, &my_op1);
+    m.impl(<schema_my_op2>, &my_op2);
+    m.impl(<schema_my_op2_backward>, &my_op2_backward);
+  }
+
+Now let's zoom in on which operators require a kernel from a customized backend and what exactly goes
+inside those kernels.
+
+PyTorch currently has more than 1600 operators and it’s still growing. It’s unrealistic
+for backend extensions to keep up with this speed. Even for native backends like CPU
+or CUDA, it often requires a lot of work to write dedicated kernels for every new op.
+
+Fortunately, some native PyTorch kernels are written in a way that decomposes into a
+combination of several known operators. In other words, you only need to implement
+a set of known operators (the ops that require registration below) instead of all PyTorch operators.
+For example, ``absolute`` in the snippet above is implemented in terms of ``abs``, so a backend
+that implements ``abs`` gets ``absolute`` for free.
+
+PyTorch operators can be classified into two categories:
+
+* Ops that require registration: the PyTorch native implementation for these ops is backend specific,
+  so it's required to provide a kernel for the customized backend. Otherwise, calling such an op
+  on the customized backend will error out (see the sketch after this list).
+
+  * In ``RegistrationDeclarations.h`` these operators have ``dispatch`` set to True *and* ``default`` set to False
+    in the metadata found in their accompanying comments.
+
+* Registration is optional: backend extenders can skip registering these ops without sacrificing any support.
+  However, if a backend extender wants to override the default kernel provided by PyTorch, they can still
+  register their customized kernel to their backend and the dispatcher will use it for their backend only.
+  For example, the current implementation of PyTorch's ``max_pool2d`` returns ``indices`` as part of forward outputs, which
+  creates overhead in torch_xla, so torch_xla registers its own kernel for ``max_pool2d`` instead.
+
+  * In ``RegistrationDeclarations.h`` these operators have ``dispatch`` set to False *or* ``default`` set to True
+    in the metadata found in their accompanying comments.
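+
+As a minimal sketch of the first category, here is what a required kernel could look like for
+``abs.out`` from the snippet above (``dispatch=True``, ``default=False``). The
+``my_backend::abs_out_impl`` call is an assumed, backend-specific helper; following the rule above,
+the kernel matches the C++ signature from ``RegistrationDeclarations.h`` exactly:
+
+.. code-block:: cpp
+
+  // abs.out requires registration: no backend-agnostic default kernel exists.
+  at::Tensor& my_abs_out(at::Tensor& out, const at::Tensor& self) {
+    my_backend::abs_out_impl(out, self);  // assumed hardware-specific call
+    return out;
+  }
+
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl("abs.out", my_abs_out);
+  }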
+
+
+
+Autograd support for the new backend
+------------------------------------
+
+Gradient formulas are mostly purely mathematical and thus are general for all backends.
+PyTorch often registers a kernel to the alias dispatch key ``Autograd``, which means it can be used by all backends.
+
+For these operators you don't have to worry about their derivative formulas;
+you can just write forward definitions for the operators in ``RegistrationDeclarations.h``, and PyTorch will handle
+the backward for you automatically.
+
+.. code-block:: cpp
+
+
+  Tensor my_op1(const Tensor& self, const Tensor& other) {
+    // call your backend-specific APIs to implement my_op1 so that
+    // it matches PyTorch's native behavior
+  }
+
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op1>, &my_op1);
+  }
+
+
+In some cases, PyTorch backward kernel implementations are also device specific so that they can squeeze the
+maximum performance out of each backend. For those operators you'll see ``op_backward`` showing up in
+``RegistrationDeclarations.h`` as *required registration* as well.
+
+.. code-block:: cpp
+
+
+  Tensor my_op2_backward(const Tensor& self, const Tensor& other) {
+    // call your backend-specific APIs to implement my_op2_backward so that
+    // it matches PyTorch's native behavior
+  }
+
+  // Note that the backward kernel is still registered to PrivateUse1 instead of AutogradPrivateUse1.
+  // PyTorch will wrap your backward kernel with the proper autograd setup and then link to it in
+  // my_op2's AutogradPrivateUse1 kernel.
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op2>, &my_op2);
+    m.impl(<schema_my_op2_backward>, &my_op2_backward);
+  }
+
+
+In a few *rare* cases, PyTorch's gradient formula for certain operators may have assumptions that don't generalize
+for all backends. In those cases backend extenders can optionally override the PyTorch Autograd layer by registering
+a kernel implemented with ``torch::autograd::Function`` to the corresponding dispatch key (for example,
+``AutogradPrivateUse1`` if you're using ``PrivateUse1`` for your backend):
+
+
+.. code-block:: cpp
+
+
+  class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+   public:
+    static Tensor forward(AutogradContext *ctx, torch::Tensor self, torch::Tensor other) {
+      at::AutoNonVariableTypeMode g;
+      return myadd(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+      auto grad_output = grad_outputs[0];
+      return {grad_output, grad_output};
+    }
+  };
+
+  Tensor myadd_autograd(const Tensor& self, const Tensor& other) {
+    return MyAddFunction::apply(self, other)[0];
+  }
+
+  // Register the autograd kernel to AutogradPrivateUse1
+  TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
+    m.impl(<myadd_schema>, &myadd_autograd);
+  }
+
+  // Register the inference kernel to PrivateUse1
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<myadd_schema>, &myadd);
+  }
+
+
+
+With this trick you have full control over both training and inference behavior for the ``myadd`` operator in your backend.
+Here's `an example `_ in the ``pytorch/xla`` repository.
+
+
+Build an extension
+------------------
+
+An out-of-tree backend is supported by adding a C++ extension to PyTorch.
+Once you have kernels and registrations ready, you can build a C++ extension by
+writing a ``setup.py`` script that uses ``setuptools`` to compile the C++ code. Here's a simplified example from the
+`pytorch/xla repo `_::
+
+  from setuptools import setup
+  from torch.utils.cpp_extension import BuildExtension, CppExtension
+
+  setup(
+      name='torch_xla',
+      ext_modules=[
+          CppExtension(
+              '_XLAC',
+              torch_xla_sources,
+              include_dirs=include_dirs,
+              extra_compile_args=extra_compile_args,
+              library_dirs=library_dirs,
+              extra_link_args=extra_link_args + \
+                  [make_relative_rpath('torch_xla/lib')],
+          ),
+      ],
+      cmdclass={
+          'build_ext': Build,  # Build is a derived class of BuildExtension
+      },
+      # more configs...
+  )
+
+
+See `our C++ extension tutorial <cpp_extension.html>`_
+for more details.
+
+
+Custom operator support
+-----------------------
+
+Your new backend should work seamlessly with
+`customized operators extended in Python `_
+without writing any new kernels, as long as the customized operator is composed of existing
+PyTorch operators (which are already supported by your backend).
+
+For `custom operators extended in C++ `_, they often come with a
+`backend specific C++ kernel implementation, e.g. the nms kernel in torchvision `_,
+as well as `a customized Python API, e.g. torch.ops.torchvision.nms `_.
+To support these operators, backend extenders will need to write a C++ kernel for their backend and properly
+register it to the corresponding namespace in the dispatcher, similar to supporting PyTorch native operators
+(see the sketch below). Alternatively, you could also add a customized API in your extension,
+e.g. ``torch_xla.core.functions.nms``, for these ad-hoc requests.
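+
+As a hedged sketch (assuming a hypothetical ``my_backend::nms_impl`` and torchvision's
+``torchvision::nms(Tensor dets, Tensor scores, float iou_threshold) -> Tensor`` schema),
+registering a backend kernel for an existing custom operator looks just like registering an
+``aten`` kernel, only under the custom op's namespace:
+
+.. code-block:: cpp
+
+  // Kernel for torchvision::nms on the PrivateUse1 backend.
+  // A `float` in the schema corresponds to `double` in the C++ signature.
+  at::Tensor my_nms(const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) {
+    return my_backend::nms_impl(dets, scores, iou_threshold);  // assumed backend call
+  }
+
+  TORCH_LIBRARY_IMPL(torchvision, PrivateUse1, m) {
+    m.impl("nms", my_nms);
+  }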
+
+JIT support
+-----------
+
+As we mentioned in `Registering a Dispatched Operator in C++ <dispatcher.html>`_, kernels registered through the ``m.impl()`` API
+support being called in both unboxed and boxed ways. In other words, your customized backend can also work with our
+JIT tracing/scripting frontend just like in-tree backends such as CPU or CUDA do. You could potentially also write specialized
+optimization passes for your backend on a JIT graph, but we will not discuss that here since we haven't finalized
+the integration point in JIT; the current backend support focuses on the eager frontend for now.
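+
+To illustrate the boxed calling convention mentioned above, here is a minimal sketch of a boxed
+*fallback*: a single function registered for every operator on a dispatch key, which receives its
+arguments on a stack of ``IValue``\s instead of through a typed signature. The error-raising body
+is an assumption of this sketch; a real backend might redispatch to CPU here instead.
+
+.. code-block:: cpp
+
+  // A boxed kernel: one signature serves all operators.
+  void my_backend_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+    TORCH_CHECK(false, "Operator not yet supported on PrivateUse1: ", op.schema().name());
+  }
+
+  // The wildcard namespace `_` registers this fallback for all operators on PrivateUse1.
+  TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
+    m.fallback(torch::CppFunction::makeFromBoxedFunction<&my_backend_fallback>());
+  }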
+
+
+Testing your backend against native PyTorch backends
+----------------------------------------------------
+
+PyTorch lets tests run on multiple device types using its `generic device type testing framework `_.
+You can find details about `how tests use it